day_4:文本存储_1
TXT
from pyquery import PyQuery
import re
import json
import requests


def get_html(url):
    """Fetch *url* and return the response body text, or None on failure."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        return r.text
    except requests.RequestException:  # narrow: only network / non-2xx errors
        print('status_code is not 200')
        return None


def parse_time(text):
    """Extract 'YYYY' or 'YYYY-MM-DD' from a release-time string.

    Returns '' when no date is present instead of raising AttributeError.
    """
    m = re.search(r'\d{4}(-\d{2}-\d{2})*', text)
    return m.group() if m else ''


def parse_html(html, f):
    """Parse one board page and write each movie as a text record to *f*."""
    doc = PyQuery(html)
    dd_nodes = doc('dl.board-wrapper')
    ranks = dd_nodes('.board-index').items()
    names = dd_nodes('.name').items()
    actors = dd_nodes('.star').items()
    times = dd_nodes('.releasetime').items()
    integers = dd_nodes('.integer').items()
    fractions = dd_nodes('.fraction').items()
    for rank, name, actor, ts, integer, fraction in zip(
            ranks, names, actors, times, integers, fractions):
        record = '\n'.join([
            rank.text(),
            name.text(),
            actor.text().replace('主演:', ''),
            parse_time(ts.text()),
            integer.text() + fraction.text(),
        ])
        # BUG FIX: the original built the record but never wrote it to the
        # file, so movie.txt ended up empty.
        f.write(record + '\n\n')


if __name__ == '__main__':
    url = 'http://maoyan.com/board/4'
    # encoding='utf-8' so Chinese titles/actor names are written portably
    with open('movie.txt', 'w', encoding='utf-8') as f:
        for i in range(10):
            path = url + '?offset=' + str(i * 10)
            print(path)
            html = get_html(path)
            if html:
                parse_html(html, f)
JSON
json.loads(str)把JSON格式的字符串解析为Python对象(dict/list等)
json.dumps(obj, indent=2, ensure_ascii=False)把Python对象序列化为JSON字符串
indent=2设置格式,2代表缩进字符数
ensure_ascii=False解决乱码
from pyquery import PyQuery
import re
import json
import requests


def get_html(url):
    """Fetch *url* and return the response body text, or None on failure."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        return r.text
    except requests.RequestException:  # narrow: only network / non-2xx errors
        print('status_code is not 200')
        return None


def parse_time(text):
    """Extract 'YYYY' or 'YYYY-MM-DD' from a release-time string.

    Returns '' when no date is present instead of raising AttributeError.
    """
    m = re.search(r'\d{4}(-\d{2}-\d{2})*', text)
    return m.group() if m else ''


def parse_html(html, f):
    """Parse one board page and write each movie as a JSON record to *f*."""
    doc = PyQuery(html)
    dd_nodes = doc('dl.board-wrapper')
    ranks = dd_nodes('.board-index').items()
    names = dd_nodes('.name').items()
    actors = dd_nodes('.star').items()
    times = dd_nodes('.releasetime').items()
    integers = dd_nodes('.integer').items()
    fractions = dd_nodes('.fraction').items()
    for rank, name, actor, ts, integer, fraction in zip(
            ranks, names, actors, times, integers, fractions):
        data = {
            'rank': rank.text(),
            'name': name.text(),
            'actor': actor.text().replace('主演:', ''),
            'time': parse_time(ts.text()),
            'score': integer.text() + fraction.text()
        }
        # BUG FIX: append '\n' after each record — the original concatenated
        # consecutive JSON objects with no separator at all.
        f.write(json.dumps(data, indent=2, ensure_ascii=False) + '\n')


if __name__ == '__main__':
    url = 'http://maoyan.com/board/4'
    # encoding='utf-8' keeps the un-escaped Chinese text readable on disk
    with open('movie_json.txt', 'w', encoding='utf-8') as f:
        for i in range(10):
            path = url + '?offset=' + str(i * 10)
            print(path)
            html = get_html(path)
            if html:
                parse_html(html, f)
CSV
import csv

# newline='' stops the interpreter from translating the csv module's own
# row terminators, which otherwise yields blank lines between rows on
# Windows (required per the csv module documentation).
with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')  # delimiter defaults to ','
    writer.writerow(['id', 'name', 'age'])
    writer.writerow(['1001', 'mike', 20])
    writer.writerow(['1002', 'bob', 22])
    writer.writerow(['1003', 'jordan', 21])
    # writer.writerows([['1001', 'mike', 20], ['1002', 'bob', 22], ['1003', 'jordan', 21]])  # 写入多行
CSV存入字典类型数据
import csv

# newline='' is required by the csv module docs; without it every row is
# followed by a blank line on Windows.
with open('data.csv', 'w', newline='') as csvfile:
    fieldnames = ['id', 'name', 'age']
    # fieldnames fixes the column order and supplies the header row
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerow({'id': 1001, 'name': 'mike', 'age': 20})
    writer.writerow({'id': 1002, 'name': 'bob', 'age': 22})
    writer.writerow({'id': 1003, 'name': 'char', 'age': 24})
# 追加数据 (append rows to an existing CSV file)
import csv
import os

# Write the header only when the file is new or empty — the original wrote
# it on every run, so repeated appends interleaved duplicate header rows
# with the data.
need_header = (not os.path.exists('data.csv')
               or os.path.getsize('data.csv') == 0)
# newline='' per the csv docs; encoding='utf-8' for the Chinese names
with open('data.csv', 'a', encoding='utf-8', newline='') as csvfile:
    fieldnames = ['id', 'name', 'age']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    if need_header:
        writer.writeheader()
    writer.writerow({'id': 1001, 'name': '张', 'age': 20})
    writer.writerow({'id': 1002, 'name': '李', 'age': 22})
    writer.writerow({'id': 1003, 'name': '黄', 'age': 24})
# 读取 (read a CSV file row by row)
import csv

# newline='' is recommended by the csv docs for reading too, so that
# newlines embedded inside quoted fields are handled correctly.
with open('data.csv', 'r', encoding='utf-8', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:  # each row is a list of column strings
        print(row)
from pyquery import PyQuery
import csv
import re
import requests

# Shared column order for movie.csv (header and data rows must agree).
FIELDNAMES = ['rank', 'name', 'actor', 'time', 'score']


def get_html(url):
    """Fetch *url* and return the response body text, or None on failure."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        return r.text
    except requests.RequestException:  # narrow: only network / non-2xx errors
        print('status_code is not 200')
        return None


def parse_time(text):
    """Extract 'YYYY' or 'YYYY-MM-DD' from a release-time string.

    Returns '' when no date is present instead of raising AttributeError.
    """
    m = re.search(r'\d{4}(-\d{2}-\d{2})*', text)
    return m.group() if m else ''


def parse_html(html):
    """Parse one board page and append its movies to movie.csv."""
    doc = PyQuery(html)
    dd_nodes = doc('dl.board-wrapper')
    ranks = dd_nodes('.board-index').items()
    names = dd_nodes('.name').items()
    actors = dd_nodes('.star').items()
    times = dd_nodes('.releasetime').items()
    integers = dd_nodes('.integer').items()
    fractions = dd_nodes('.fraction').items()
    # newline='' per the csv docs; utf-8 for the Chinese titles/names
    with open('movie.csv', 'a', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
        for rank, name, actor, ts, integer, fraction in zip(
                ranks, names, actors, times, integers, fractions):
            writer.writerow({
                'rank': rank.text(),
                'name': name.text(),
                'actor': actor.text().replace('主演:', ''),
                'time': parse_time(ts.text()),
                'score': integer.text() + fraction.text()
            })


if __name__ == '__main__':
    url = 'http://maoyan.com/board/4'
    # BUG FIX: write the header and CLOSE the file before parse_html()
    # re-opens it in append mode.  The original kept the 'w' handle open
    # (header still buffered) across all the appends, so the header could
    # flush at offset 0 last and overwrite the first appended rows.
    with open('movie.csv', 'w', encoding='utf-8', newline='') as csvfile:
        csv.DictWriter(csvfile, fieldnames=FIELDNAMES).writeheader()
    for i in range(10):
        path = url + '?offset=' + str(i * 10)
        print(path)
        html = get_html(path)
        if html:
            parse_html(html)