Pandas Dataframe的一些操作
读取txt
# ' 和 " 没区别,路径为 \ ,./ 表示为当前目录下路径
df = pd.read_csv("./input/"+year+"ha.txt",skiprows =13,skipfooter=2,header=None,sep=" ")
#skiprows=[0,2,5]跳过指定行 skiprows=[i for i in range(1,3)]跳过除首行外前n行
#skipfooter=2 跳过末尾n行
#skiprows= lambda x: logic(x) 跳过符合条件的行 其中logic为设定的筛选函数,可参照如下设置
def logic(index):
if index % 3 == 0:
return True
return False
读取dbf
import pandas as pd from simpledbf import Dbf5 #未下载需下载pip install simpledbf dbf = Dbf5(r"D:\01DATA\dem.dbf") df = dbf.to_dataframe() df['Id']=df.index
#删除列 del df[0] del df[5]
del df['name'] df=df.iloc[0:221]#选择前221行 df.columns = ['location_code','Hub','Autho']#给列重命名
df3.columns=['Eigenvec']
df3=df3['Eigenvec'].str.split(' ', expand=True)#在某列内部以空格进行分割
df3['new'] = df3.index#创建新列 和索引列相同
loca = df3['new'].str.split(' ', expand=True)
result = pd.merge(df1, df2, how='inner', on=['location_code'])#公共列为'location_code' 取交集 result = pd.merge(result, df3, how='inner', on=['location_code'])
result.to_excel('./output/worldtrade_'+year+'_all.xlsx',sheet_name='Sheet')#导出
编码错误
df_w = pd.read_table(r'C:\Users\lab\list.txt', encoding='ISO-8859-1')#\加r #encoding = 'utf-8'
将多个dataframe存到一个excel中不同sheet中:
from pandas import ExcelWriter
fields=['degree','weighted degree','Eigenvec','Autho','Hub']
years=['1992','2004','2009','2015','2018']
for year in years:
writer = ExcelWriter('./output/n_'+year+'.xlsx')
#writer = pd.ExcelWriter('./output/n_1992.xlsx', engine='xlsxwriter')
for field in fields:
df = pd.read_excel('./output/'+year+'.xlsx')
#field='degree'
df_5=df.sort_values(by=field, ascending=False).head(5)#对某列按照字段降序排列
df_n=df.loc[df['Label'].isin(['IND','CHN','RUS','BEL','ZAF'])]#选出符合条件的rows
df_=pd.concat([df_5,df_n])
df_=df_.sort_values(by=field, ascending=False)
df1=df_[['Label',field]]#提取出列
df1['Index']=df1[field]/df1[field].max()*100
df2 = df1.drop_duplicates(subset='Label')#除去重复的行 根据Label属性列
df2.to_excel(writer,sheet_name=field)#sheet的名字和field一致
writer.save()#注意保存放置的位置 需要都存好后再保存
根据某一类进行排序,相等值序列相同,同时需要某些固定的值:
from pandas import ExcelWriter
fields=['degree','weighted degree','Eigenvec','Autho','Hub']
years=['1992','2004','2009','2015','2018']
for year in years:
writer = ExcelWriter('./output5/sort/n_'+year+'.xlsx')
#writer = pd.ExcelWriter('./output5/sort/n_1992.xlsx', engine='xlsxwriter')
for field in fields:
df = pd.read_excel('./output5/'+year+'.xlsx')
dfs=df.sort_values(by=field, ascending=False).reset_index(drop=True)#更新index
dfs['sort_index']=dfs.index+1
dfs=dfs[['Label',field,'sort_index']]
for ind in dfs.index:
if ind > 0:
if dfs[field][ind]==dfs[field][ind-1]:
dfs['sort_index'][ind]=dfs['sort_index'][ind-1]
dfs['sort_index'][ind+1:]=dfs['sort_index'][ind+1:]-1
df_5=df.sort_values(by=field, ascending=False).head(5)
df_n=df.loc[df['Label'].isin(['IND','CHN','RUS','BEL','ZAF'])]
df_=pd.concat([df_5,df_n])
df_=df_.sort_values(by=field, ascending=False)
df1=df_[['Label',field]]
df1['Index/%']=df1[field]/df1[field].max()*100
df2 = df1.drop_duplicates(subset='Label')
df2['sort_index'] = df2['Label'].map(dfs.set_index('Label')['sort_index'])#匹配dfs(多)中的'sort_index',匹配字段为Label https://stackoverflow.com/questions/46789098/create-new-column-in-dataframe-with-match-values-from-other-dataframe
df2 = df2[[field, 'sort_index', 'Label','Index/%']]#按照想的给列排序导出
df2['Index/%']=df2['Index/%'].round(decimals=2)#对这一列保留小数点后两位小数
df2.to_excel(writer,sheet_name=field)
writer.save()
将一文件夹下所有的xlsx文件转为csv以方便在ArcGIS中使用:
import pandas as pd
import glob
# folder C:\Users\admin\
for filepath in glob.iglob(r'C:\Users\admin\*.xlsx'):
data_xlsx = pd.read_excel(filepath, 'Sheet1', index_col=None)
data_xlsx.to_csv(filepath[:-5]+'.csv', encoding='utf-8')
#filepath[:-5]表示文件路径去除后五位,即'.xlsx'后的文件名
print(filepath)
其他遍历文件下所有文件的代码可移步参考:https://www.newbedev.com/python/howto/how-to-iterate-over-files-in-a-given-directory/
按规则选择行
df.loc[df['column_name'] == some_value] df.loc[df['column_name'].isin(some_values)] df.loc[~df['column_name'].isin(some_values)] df.loc[(df['column_name'] >= A) & (df['column_name'] <= B)]
创建dataframe 并增加新的行
import pandas as pd
# create an Empty DataFrame
# object With column names only
df = pd.DataFrame(columns = ['Name', 'Articles', 'Improved'])
print(df)
# append rows to an empty DataFrame
df = df.append({'Name' : 'Ankit', 'Articles' : 97, 'Improved' : 2200},
ignore_index = True)
df = df.append({'Name' : 'Aishwary', 'Articles' : 30, 'Improved' : 50},
ignore_index = True)
df = df.append({'Name' : 'yash', 'Articles' : 17, 'Improved' : 220},
ignore_index = True)
df
统计某列数据各个值出现的频次:
import pandas as pd
import glob
from pandas import ExcelWriter
dft = pd.DataFrame(columns = ['USA', 'CHN', 'JPN','IND','ZAF'])
fields=['maxtrade_partner_code','maxex_partner_code','maxim_partner_code']
# folder C:\Users\admin\
writer = ExcelWriter('G:/out.xlsx')
for field in fields:
print(field)
for filepath in glob.iglob(r'D:\1Money\ylh工作安排9-1\*.xlsx'):#遍历文件夹所有xlsx文件
print(filepath)
df = pd.read_excel(filepath, 'Sheet 1', index_col=None)
idx = pd.Index(df[field]) #索引
co=pd.DataFrame(idx.value_counts()) #统计频次转为 dataframe
dft = dft.append({'USA':co.at['USA',field],'CHN' :co.at['CHN',field],'JPN':co.at['JPN',field] ,'IND' :co.at['IND',field],'ZAF':co.at['ZAF',field]},ignore_index = True)
dft.to_excel(writer,sheet_name=field)
dft.drop(dft.index, inplace=True)
writer.save()
将dataframe中的负值替换为其他
df[df < 0] = np.nan
简单数据探索
import seaborn as sns import matplotlib.pyplot as plt f, ax = plt.subplots(figsize=(20, 7))#设置图片尺寸 sns.displot(df['RASTERVALU']);#对这一列数据频率直方图显示 sns.lineplot(x=df['Id'],y=df['RASTERVALU']);#折线图

浙公网安备 33010602011771号