Python 爬虫-数据存储
json支持
JSON 全称是 JavaScript Object Notation,即 JavaScript 对象表示法,是一种轻量级的数据交换格式。
dumps()和dump()的encode操作(将python对象转换为json字符串)。
import json
# Encode a nested Python object as a JSON string: the tuple is written as
# a JSON array and None is written as null.
nested = ['yeeku', {'favorite': ('coding', None, 'game', 25)}]
s = json.dumps(nested)
print(s)  # ["yeeku", {"favorite": ["coding", null, "game", 25]}]

# Encode a Python string as a JSON string; the embedded quote and the
# backspace character (\b) are both escaped in the output.
s2 = json.dumps("\"foo\bar")
print(s2)  # "\"foo\bar"

# Encode a dict with its keys emitted in sorted order.
unsorted_counts = {"c": 0, 'b': 0, 'a': 0}
s3 = json.dumps(unsorted_counts, sort_keys=True)
print(s3)  # {"a": 0, "b": 0, "c": 0}

# Encode a list with custom separators: no space after commas or colons.
compact_separators = (',', ':')
s4 = json.dumps([1, 2, 3, {'x': 5, 'y': 7}], separators=compact_separators)
print(s4)  # [1,2,3,{"x":5,"y":7}]

# indent=4 pretty-prints the JSON text with four-space indentation.
s5 = json.dumps({'python': 5, 'kotlin': 7}, indent=4, sort_keys=True)
print(s5)
# {
#     "kotlin": 7,
#     "python": 5
# }

# JSONEncoder.encode() also turns a Python object into a JSON string;
# the non-ASCII characters come out as \uXXXX escapes.
encoder = json.JSONEncoder()
s6 = encoder.encode({'name': ('憨八龟', 'jim')})
print(s6)  # {"name": ["\u61a8\u516b\u9f9f", "jim"]}
# Serialize a Python object straight into a file with dump(). The `with`
# block closes the file, so the JSON text is flushed to disk before
# a.json is opened again for reading.
with open('a.json', 'w', encoding='utf-8') as f:
    json.dump(['kotlin', {'python': 'nice'}], f)
loads()和load()函数的decode操作(将json字符串转换成python对象)。
# Decode a JSON array back into a Python list; null comes back as None.
json_array_text = '["yeeku", {"favorite": ["coding", null, "game", 25]}]'
result1 = json.loads(json_array_text)
print(result1)  # ['yeeku', {'favorite': ['coding', None, 'game', 25]}]

# Decode a JSON string (with escaped quotes) back into a Python string.
json_string_text = '"\\"foo\\"bar"'
result2 = json.loads(json_string_text)
print(result2)  # "foo"bar
# Parse a JSON document from an open file with load(). The `with` block
# closes the file handle as soon as parsing is finished.
with open('a.json', encoding='utf-8') as f:
    result3 = json.load(f)
CSV文件处理:
CSV文件读取的两种方式:
import csv
# Option 1: csv.reader yields every row as a list, so a single value is
# picked out by its column index.
# with open("stock.csv", 'r', encoding='gbk', newline='') as fp:
#     reader = csv.reader(fp)
#     for x in reader:
#         print(x[3])

# Option 2: csv.DictReader yields every row as a dict, so a single value
# is looked up by its column name.
# newline='' lets the csv module handle line endings itself, as the csv
# documentation recommends for files passed to a reader.
with open("stock.csv", 'r', encoding='gbk', newline='') as fp:
    reader = csv.DictReader(fp)
    for x in reader:
        print(x['secShortName'])
CSV文件写入的两种方式:
import csv
# Column names, used as the header row by both writing options below.
headers = ('name', 'age', 'height')

# Option 1 data: each row is a tuple whose values follow the header order.
# students = [
#     ("张三", 18, 180),
#     ("李四", 19, 190),
#     ("王五", 20, 170)
# ]

# Option 2 data: each row is a dict keyed by column name.
students = [
    {"name": "张三", "age": 18, "height": 180},
    {"name": "李四", "age": 19, "height": 190},
    {"name": "王五", "age": 20, "height": 170},
]

# Option 1: csv.writer writes the header row and the tuple rows.
# with open("students.csv", 'w', encoding='utf-8', newline='') as fp:
#     writer = csv.writer(fp)
#     writer.writerow(headers)
#     writer.writerows(students)

# Option 2: csv.DictWriter writes the dict rows.
with open("students.csv", 'w', encoding='utf-8', newline='') as fp:
    writer = csv.DictWriter(fp, headers)
    # Passing headers to DictWriter does not write the header row by
    # itself; writeheader() must be called explicitly or the file will
    # contain no header line.
    writer.writeheader()
    writer.writerows(students)
Excel文件处理:
Sheet相关的操作:
# xlrd must be imported before open_workbook() can be called.
import xlrd

# Open the workbook that the following examples read from.
# NOTE(review): xlrd 2.0 and later only read legacy .xls files; opening
# this .xlsx file requires xlrd < 2.0 - confirm the installed version.
workbook = xlrd.open_workbook("成绩表.xlsx")

# List the names of all sheets.
# print(workbook.sheet_names())

# Get one sheet object by its index.
# sheet = workbook.sheet_by_index(1)
# print(sheet.name)

# Get one sheet object by its name.
# sheet = workbook.sheet_by_name("2班")
# print(sheet.name)

# Get all sheet objects.
# sheets = workbook.sheets()
# for sheet in sheets:
#     print(sheet.name)

# Print the number of rows and columns of the first sheet.
sheet = workbook.sheet_by_index(0)
print({"rows": sheet.nrows, "cols": sheet.ncols})
Cell相关的操作:
from xlrd.sheet import Cell
# Fetch one Cell object from the first sheet and print the value it holds.
sheet = workbook.sheet_by_index(0)
row_idx, col_idx = 1, 1
cell = sheet.cell(row_idx, col_idx)
print(cell.value)

# Alternative: a slice of Cell objects taken from one row.
# cells = sheet.row_slice(1,1,4)
# for cell in cells:
#     print(cell.value)

# Alternative: a slice of Cell objects taken from one column.
# cells = sheet.col_slice(0,1,sheet.nrows)
# for cell in cells:
#     print(cell.value)

# Alternative: read a single cell's value without the Cell object.
# cell_value = sheet.cell_value(0,1)
# print(cell_value)

# Alternative: the values of a column slice.
# cell_values = sheet.col_values(1,1,sheet.nrows)
# print(cell_values)

# Alternative: the values of a row slice.
# cell_values = sheet.row_values(1,1,sheet.ncols)
# print(cell_values)
Cell中常用的数据类型:
# Each example prints a cell's ctype followed by one xlrd.XL_CELL_*
# constant, so the two printed codes can be compared.
sheet = workbook.sheet_by_index(0)

# Compare against the text type code.
# cell = sheet.cell(0,0)
# print(cell.ctype)
# print(xlrd.XL_CELL_TEXT)

# Compare against the number type code.
# cell = sheet.cell(1,1)
# print(cell.ctype)
# print(xlrd.XL_CELL_NUMBER)

# Compare against the date type code.
# cell = sheet.cell(19,0)
# print(cell.ctype)
# print(xlrd.XL_CELL_DATE)

# Compare against the boolean type code.
# cell = sheet.cell(19,0)
# print(cell.ctype)
# print(xlrd.XL_CELL_BOOLEAN)

# Compare against the empty-cell type code.
cell = sheet.cell(1, 1)
actual_type = cell.ctype
print(actual_type)
print(xlrd.XL_CELL_EMPTY)