网站更新内容:请访问: https://bigdata.ministep.cn/

IO数据读取存储

csv数据读取

import pandas as pd
from datetime import datetime
import time

def read_csv(from_path):
    table = pd.read_csv(from_path) ##选取表
    return table
def write_csv(to_path,data):
    data.to_csv(to_path,index=False)
    
if __name__ == "__main__":
    start_time = time.time() # 开始时间
    path = 'C:/Users/lenovo/Desktop'
    file_name ='query_hive_162768.csv'
    from_path = path+"/"+file_name
    print(from_path)
    df = read_csv(from_path)
    df.head()
    end_time = time.time() #结束时间
    print("程序耗时%f秒." % (end_time - start_time))
    print(df.head())

Excel保存

import re
import pandas  as pd
from datetime import datetime
import time

def read_xlsx(path,sheet_name):
    xlsx_file = pd.ExcelFile(path) ##路径
    table = xlsx_file.parse(sheet_name) ##选取表
    return table

if __name__ == "__main__":
    start_time = time.time() # 开始时间
    path = 'C:/Users/lenovo/Desktop'
    file_name ='bdl_goods_income.xlsx'
    sheet_name_list = {
    'hive':'Sheet',
    'email':'data',
    'mysql':'Sheet4'
    }
    
    path = path+"\\"+file_name
    sheet_name = sheet_name_list['email']
    #sheet_name = sheet_name_list['email']    
    df = read_xlsx(path,sheet_name)
    print(df.head())
    end_time = time.time() #结束时间
    print("程序耗时%f秒." % (end_time - start_time))

Excel 读取

import re
import pandas  as pd
from datetime import datetime
import time

def read_xlsx(path,sheet_name):
    xlsx_file = pd.ExcelFile(path) ##路径
    table = xlsx_file.parse(sheet_name) ##选取表
    return table

if __name__ == "__main__":
    start_time = time.time() # 开始时间
    path = 'C:/Users/lenovo/Desktop'
    file_name ='华东交付盘质检日报12月.xlsx'
    sheet_name_list = {
    'hive':'Sheet',
    'email':'data',
    'mysql':'Sheet4'
    }
    
    path = path+"\\"+file_name
    sheet_name = sheet_name_list['hive']
    #sheet_name = sheet_name_list['email']    
    df = read_xlsx(path,sheet_name)
    print(df.head())
    end_time = time.time() #结束时间
    print("程序耗时%f秒." % (end_time - start_time))

存取到mysql

import pymysql
from sqlalchemy import create_engine
import mysql.connector as sql
def py_mysql(table,table_name):
    try:
        engine = create_engine("mysql+pymysql://root:password@127.0.0.1:3306/temp?charset=utf8mb4")
        table.to_sql(name=table_name,con=engine,if_exists='replace',index=False,chunksize=10000)
        print ('数据库写入成功')
    except :
        print ('数据库写入失败')
if __name__ == "__main__":
    start_time = time.time() # 开始时间
    try:
        py_mysql(df,'点位1206')
    except:
        print('sql查询失败')
    end_time = time.time() #结束时间
    print("程序耗时%f秒." % (end_time - start_time))



循环读取目录Csv文件

import os,re
import pandas as pd
 
dir_path = 'C:/Users/lenovo/Desktop/inv_2'
 
#col = [] ##选取需要的字段,看了下不是所有的字段都需要的
#df = pd.DataFrame([],columns=col)

df = pd.DataFrame([])
 
for root,dirs,files in os.walk(dir_path):##文件夹的路径
    if files: ##判断是否有文件
        for file_name in files: ##循环文件的名称
            if '.csv' in file_name: #判定是不是文件是否有o2o_order结尾的文件,是的就继续,不是的就退出了;
                path = os.path.join(root,file_name)
                print('正在处理的文件是%s'%(path))
                try:
                    data = pd.read_table(path,sep=',') ##路径,注意指定文件的分隔符号
                    df_tmp = pd.DataFrame(data)
                    print(df_tmp.head())
                    df = pd.concat([df,df_tmp], ignore_index=True) ## 数据合并
                    print(df.shape)
                except:
                    print('读取文件,处理数据失败')
            else:
                print('warning:非.csv文件不读取')
 
#print(df.head())
 
#保存到本地
 
out_path= 'C:/Users/lenovo/Desktop/inv_2/output-1.xlsx'
 
writer = pd.ExcelWriter(out_path, engine='xlsxwriter')
 
df.to_excel(writer,'Sheet1')
 
writer.save()

循环读取目录Xlsx文件

import os,re
import pandas as pd
 
dir_path = 'C:/Users/lenovo/Desktop/email'

#col = [] ##选取需要的字段,看了下不是所有的字段都需要的
#df = pd.DataFrame([],columns=col)

df = pd.DataFrame([])
 
for root,dirs,files in os.walk(dir_path):##文件夹的路径
    if files: ##判断是否有文件
        for file_name in files: ##循环文件的名称
            if '.xls' in file_name: #判定是不是文件是否有o2o_order结尾的文件,是的就继续,不是的就退出了;
                path = os.path.join(root,file_name)
                print('正在处理的文件是%s'%(path))
                try:
                    xlsx_file = pd.ExcelFile(path) ##路径
                    data = xlsx_file.parse('data') ##选取表
                    df_tmp = pd.DataFrame(data)
                    print(df_tmp.head())
                    df = pd.concat([df,df_tmp], ignore_index=True, sort=True) ## 数据合并
                    print(df.shape)
                except:
                    print('读取文件,处理数据失败')
            else:
                print('warning:非.xlsx文件不读取')
 
#print(df.head())
 
#保存到本地
 
out_path= 'C:/Users/lenovo/Desktop/email/output-1.xlsx'
 
writer = pd.ExcelWriter(out_path, engine='xlsxwriter')
 
df.to_excel(writer,'Sheet1')
 
writer.save()

读取剪贴板数据

# 读取数据
import pandas as pd
df=pd.read_clipboard()
posted @ 2021-04-04 20:13  ministep88  阅读(135)  评论(0)    收藏  举报
网站更新内容:请访问:https://bigdata.ministep.cn/