Python 爬虫-数据存储

json支持

JSON的全称是JavaScript Object Notation,即JavaScript对象表示法,是一种轻量级的数据交换格式。

dumps()和dump()的encode操作(将python对象转换为json字符串)。

import json

# Encode a nested Python object as JSON text: the tuple is emitted as a JSON
# array and None is emitted as null.
nested = ["yeeku", {"favorite": ("coding", None, "game", 25)}]
s = json.dumps(nested)
print(s)  # ["yeeku", {"favorite": ["coding", null, "game", 25]}]

# Encode a Python str; the double quote and the backspace (\b) are escaped.
s2 = json.dumps("\"foo\bar")
print(s2)  # "\"foo\bar"

# Encode a dict with its keys emitted in sorted order.
s3 = json.dumps({"c": 0, "b": 0, "a": 0}, sort_keys=True)
print(s3)  # {"a": 0, "b": 0, "c": 0}

# Encode a list with compact separators: no space after commas or colons.
compact = (",", ":")
s4 = json.dumps([1, 2, 3, {"x": 5, "y": 7}], separators=compact)
print(s4)  # [1,2,3,{"x":5,"y":7}]

# indent=4 pretty-prints the result, one key per line.
s5 = json.dumps({"python": 5, "kotlin": 7}, indent=4, sort_keys=True)
print(s5)
# {
#     "kotlin": 7,
#     "python": 5
# }

# The same conversion through an explicit JSONEncoder instance; non-ASCII
# characters come out as \uXXXX escapes.
encoder = json.JSONEncoder()
s6 = encoder.encode({"name": ("憨八龟", "jim")})
print(s6)  # {"name": ["\u61a8\u516b\u9f9f", "jim"]}
# dump() serializes the object and writes the JSON text to the given file.
# The ``with`` block closes the file on exit, which flushes the buffered
# text to disk so that a.json is complete before anything reads it back.
with open('a.json', 'w', encoding='utf-8') as f:
    json.dump(['kotlin', {'python': 'nice'}], f)

loads()和load()函数的decode操作(将json字符串转换成python对象)。

# Decode a JSON array back into a Python list (JSON null becomes None).
array_text = '["yeeku", {"favorite": ["coding", null, "game", 25]}]'
result1 = json.loads(array_text)
print(result1)  # ['yeeku', {'favorite': ['coding', None, 'game', 25]}]

# Decode a JSON string literal back into a Python str; the escaped quotes
# inside it become plain double quotes.
result2 = json.loads('"\\"foo\\"bar"')
print(result2)  # "foo"bar
# load() parses JSON directly from a file object. The ``with`` block
# guarantees the file handle is closed as soon as parsing finishes.
with open('a.json', encoding='utf-8') as f:
    result3 = json.load(f)

CSV文件处理:

CSV文件读取的两种方式:

import csv

# Option 1: csv.reader yields every record as a list, so a single field has
# to be picked by its positional index.
# with open("stock.csv", 'r', encoding='gbk', newline='') as fp:
#     reader = csv.reader(fp)
#     for row in reader:
#         print(row[3])

# Option 2: csv.DictReader yields every record as a dict, so a single field
# can be picked by its column name.
# newline='' hands line-ending handling over to the csv module; without it,
# newlines embedded inside quoted fields are not interpreted correctly.
with open("stock.csv", 'r', encoding='gbk', newline='') as fp:
    reader = csv.DictReader(fp)
    for row in reader:
        print(row['secShortName'])

CSV文件写入的两种方式:

import csv

# Column names, in the order they appear in the output file.
headers = ('name', 'age', 'height')

# Tuple form of the records, for use with csv.writer (option 1 below).
# students = [
#     ("张三", 18, 180),
#     ("李四", 19, 190),
#     ("王五", 20, 170),
# ]

# Dict form of the records, for use with csv.DictWriter (option 2 below).
students = [
    dict(name="张三", age=18, height=180),
    dict(name="李四", age=19, height=190),
    dict(name="王五", age=20, height=170),
]

# Option 1: csv.writer writes each record from a sequence.
# with open("students.csv", mode='w', encoding='utf-8', newline='') as fp:
#     writer = csv.writer(fp)
#     writer.writerow(headers)
#     writer.writerows(students)

# Option 2: csv.DictWriter writes each record from a dict.
with open("students.csv", mode='w', encoding='utf-8', newline='') as fp:
    writer = csv.DictWriter(fp, fieldnames=headers)
    # Giving DictWriter the field names does not emit them: the header row
    # is only written by an explicit writeheader() call.
    writer.writeheader()
    writer.writerows(students)

Excel文件处理:

Sheet相关的操作:

# Third-party Excel reader used by all of the Excel examples.
import xlrd

# NOTE: xlrd 2.0 and later only read legacy .xls files; opening an .xlsx
# workbook such as this one requires xlrd < 2.0 (or another library, e.g.
# openpyxl).
workbook = xlrd.open_workbook("成绩表.xlsx")

# List the names of all sheets.
# print(workbook.sheet_names())

# Fetch one sheet object by its index.
# sheet = workbook.sheet_by_index(1)
# print(sheet.name)

# Fetch one sheet object by its name.
# sheet = workbook.sheet_by_name("2班")
# print(sheet.name)

# Fetch every sheet object.
# sheets = workbook.sheets()
# for sheet in sheets:
#     print(sheet.name)

# Report the number of rows and columns of the first sheet.
sheet = workbook.sheet_by_index(0)
print({"rows":sheet.nrows,"cols":sheet.ncols})

Cell相关的操作:

from xlrd.sheet import Cell

# `workbook` is not created in this snippet; it must already be open.
sheet = workbook.sheet_by_index(0)

# Fetch one Cell object by (row index, column index) and show its value.
cell: Cell = sheet.cell(1, 1)
print(cell.value)

# Fetch a slice of Cell objects from a single row.
# cells = sheet.row_slice(1, 1, 4)
# for cell in cells:
#     print(cell.value)

# Fetch a slice of Cell objects from a single column.
# cells = sheet.col_slice(0, 1, sheet.nrows)
# for cell in cells:
#     print(cell.value)

# Fetch the value of one cell directly, without going through a Cell object.
# cell_value = sheet.cell_value(0, 1)
# print(cell_value)

# Fetch the values of a slice of one column.
# cell_values = sheet.col_values(1, 1, sheet.nrows)
# print(cell_values)

# Fetch the values of a slice of one row.
# cell_values = sheet.row_values(1, 1, sheet.ncols)
# print(cell_values)

Cell中常用的数据类型:

# The XL_CELL_* type constants live on the xlrd module itself, so the module
# name has to be bound here.
import xlrd

# `workbook` is not created in this snippet; it must already be open.
sheet = workbook.sheet_by_index(0)

# Each example prints a cell's ctype next to one xlrd type constant so the
# two numbers can be compared.

# Text cell.
# cell = sheet.cell(0,0)
# print(cell.ctype)
# print(xlrd.XL_CELL_TEXT)

# Number cell.
# cell = sheet.cell(1,1)
# print(cell.ctype)
# print(xlrd.XL_CELL_NUMBER)

# Date cell.
# cell = sheet.cell(19,0)
# print(cell.ctype)
# print(xlrd.XL_CELL_DATE)

# Boolean cell.
# cell = sheet.cell(19,0)
# print(cell.ctype)
# print(xlrd.XL_CELL_BOOLEAN)


# Empty cell.
cell = sheet.cell(1,1)
print(cell.ctype)
print(xlrd.XL_CELL_EMPTY)
posted @ 2021-03-29 19:23  KKKyrie  阅读(124)  评论(0)  编辑  收藏  举报