利用Python(copy_from)将beijing geolife trajectories txt文件批量导入到 PostgresSQL(PostGIS)
import skmob.io.file as file# https://scikit-mobility.github.io/scikit-mobility/
import psycopg2
import os
import pandas as pd
from io import StringIO
import numpy as np
def table_exist(table_name=None, conn=None, cur=None):
try:
cur.execute("select to_regclass(" + "\'" + table_name + "\'" + ") is not null")
rows = cur.fetchall()
except Exception as e:
rows = []
conn.close()
if rows:
data = rows
flag = data[0][0]
return flag
def geolif2csv(user_ids, path_to_geolife_data_dir, csv_file_dir, compress_radius_km):
for ind in range(len(user_ids)):
user_id = user_ids[ind: ind + 1]
TrajData = file.load_geolife_trajectories(path_to_geolife_data_dir=path_to_geolife_data_dir,
user_ids=user_id,
filter_kwargs={},
compress_kwargs={'spatial_radius_km': compress_radius_km})
# 列表
Trjs_name_list = ['lat', 'lon', 'datatime', 'uid']
# list转dataframe
df = pd.DataFrame(TrajData.values, columns=Trjs_name_list)
csv_name = user_id[0] + ".csv"
csv_path = os.path.join(csv_file_dir, csv_name)
# 保存到
df.to_csv(csv_path, header=False, index=False, encoding="utf-8")
path_to_geolife_data_dir = r'D:/data/Geolife Trajectories 1.3/'
csv_file_dir = "D:/data/Geolife Trajectories 1.3/Data_csv"
user_ids = os.listdir(path_to_geolife_data_dir + "Data")
pgisCon = psycopg2.connect(database="beijing", user="jiangshan", password="xxxx", host="192.168.x.xxx", port="5432")
pgisCursor = pgisCon.cursor()
table_name = "geo_life_table"
the_geom_SRID = "4326"
# CREATE TABLE IF table IS NOT EXIST
# 查询出来的表是否存在的状态,存在则为True,不存在则为False
table_flg = table_exist(table_name, pgisCon, pgisCursor)
if table_flg is False:
sql = "DROP TABLE public.{0} CASCADE".format(table_name)# -- 删除表
sql = "CREATE TABLE IF NOT EXISTS {0} (lat DOUBLE PRECISION, lon DOUBLE PRECISION, tstamp TIMESTAMP, u_id TEXT)".format(table_name)
pgisCursor.execute(sql)
pgisCon.commit()
print("Geolife Data To CSV.......")
compress_radius_km = 0.005
geolif2csv(user_ids, path_to_geolife_data_dir, csv_file_dir, compress_radius_km)
print("Geolife Data To CSV OK !!!")
# 插入数据
print('IMPORT FILES......') # copy_from 不支持 GEOMETRY对象批量导入
txt_files = os.listdir(csv_file_dir)
col_names = ['lat', 'lon', 'datatime', 'uid']
dtype_dic = {'lat': object, 'lon': object, 'datatime': object, 'uid': str}
uid_list = []
ExceptionDataFile = []
for txt in txt_files:
# print(txt)
txt_path = os.path.join(csv_file_dir, txt)
data = pd.read_csv(txt_path, header=None, names=col_names, dtype=dtype_dic)
shape = data.shape
if shape[0] >= 5:
data = data.round(decimals=7)
u_id = np.unique(data.uid.values)[0]
uid_list.append(u_id)
# dataframe类型转换为IO缓冲区中的str类型
output = StringIO()
data.to_csv(output, sep='\t', index=False, header=False)
output = output.getvalue()
pgisCursor.copy_from(StringIO(output), table_name)
pgisCon.commit()
else:
ExceptionDataFile.append(txt)
print('IMPORT FILES OK!!') # copy_from 不支持 GEOMETRY对象批量导入
print('ADD A GEOMETRY COLUMN......')
# ADD A GEOMETRY COLUMN
pgisCursor.execute("alter table " + table_name + " add the_geom GEOMETRY")
pgisCon.commit()
print('UPDATE THE GEOMETRY.....')
# UPDATE THE GEOMETRY
for uid in uid_list:
sql = "UPDATE " + table_name + " set the_geom=st_geomfromtext(\'POINT(\'|| lon ||' '|| lat ||\')\',\'"+the_geom_SRID+"\') where u_id = \'{0}\'".format(uid)
pgisCursor.execute(sql)
pgisCon.commit()
print("Exception Data File:", ExceptionDataFile)
print('CREATE INDEX OF THE GEOMETRY.....')
sql = "CREATE INDEX {0} ON t_drive USING gist ({1})".format("pt_idx", "the_geom")
pgisCursor.execute(sql)
pgisCon.commit()
pgisCursor.close()
pgisCon.close()
print('done')
个人学习记录

浙公网安备 33010602011771号