第一个: 静态页面类 —— 爬取猫眼电影 TOP 100，使用 BeautifulSoup + requests
import json

# Where saveFile() writes when the caller does not pass an explicit path.
DEFAULT_JSON_PATH = '/Users/macmini-2/Desktop/GitDemo/DailyFresh/Day_fresh/Fresh/static/jsonFile.json'


def getHtml(url):
    """Fetch ``url`` and return the response body as text.

    Returns an empty string when the request fails (connection problem,
    timeout, invalid URL, or an HTTP error status reported by
    ``raise_for_status``), so callers must be prepared to receive ''.
    """
    # Third-party import kept local so that importing this script (for
    # example to reuse saveFile) does not require requests to be installed.
    import requests

    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode with the encoding detected from the body itself.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ''


def _tag_string(parent, name, css_class):
    """Return ``.string`` of the first matching descendant tag, or None."""
    tag = parent.find(name, attrs={'class': css_class})
    return None if tag is None else tag.string


def getContent(html, info_list):
    """Parse one board page and append one dict per entry to ``info_list``.

    Each appended dict has the keys ``title``, ``star``, ``releasetime`` and
    ``score`` (the ``integer`` and ``fraction`` parts concatenated).

    Args:
        html: Page markup as returned by getHtml(); may be ''.
        info_list: List that is extended in place.

    Returns:
        The same ``info_list`` object. It is returned unchanged when the page
        has no ``<dl class="board-wrapper">`` element (which is the case for
        the '' that getHtml() yields on failure).
    """
    # Local import for the same reason as in getHtml().
    import bs4

    soup = bs4.BeautifulSoup(html, 'html.parser')
    board = soup.find('dl', attrs={'class': 'board-wrapper'})
    if board is None:
        return info_list

    for item in board.children:
        # .children also yields non-tag nodes (e.g. text between elements).
        if not isinstance(item, bs4.element.Tag):
            continue

        title = _tag_string(item, 'p', 'name')
        star = _tag_string(item, 'p', 'star')
        releasetime = _tag_string(item, 'p', 'releasetime')
        integer = _tag_string(item, 'i', 'integer')
        fraction = _tag_string(item, 'i', 'fraction')

        # Skip entries that lack any expected field instead of raising
        # AttributeError/TypeError on None.
        fields = (title, star, releasetime, integer, fraction)
        if any(value is None for value in fields):
            continue

        info_list.append({
            'title': title,
            'star': star.strip(),
            'releasetime': releasetime,
            'score': integer + fraction,
        })
    return info_list


def saveFile(info_list, path=DEFAULT_JSON_PATH):
    """Serialise ``info_list`` as JSON and write it to ``path``.

    Args:
        info_list: JSON-serialisable data to store.
        path: Destination file; defaults to DEFAULT_JSON_PATH. Any existing
            file is overwritten.
    """
    with open(path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(info_list))


def main():
    """Scrape ``depth`` pages of the Maoyan board and save them as JSON."""
    info_list = []
    depth = 1  # number of pages to fetch
    start_url = 'https://maoyan.com/board/4'
    for i in range(depth):
        # The offset query parameter advances by 10 per page.
        url = start_url + '?offset=' + str(i * 10)
        getContent(getHtml(url), info_list)
        # Rewritten after every page, so the entries collected so far are on
        # disk even if a later page cannot be processed.
        saveFile(info_list)


if __name__ == '__main__':
    main()
# Example 2: Toutiao (gallery image feed crawl)
import json

import requests

# Feed endpoint; the max_behot_time value is appended to this string.
FEED_URL = 'https://www.toutiao.com/api/pc/feed/?category=gallery_old_picture&utm_source=toutiao&max_behot_time='


def getHtml(url):
    """Fetch ``url`` with a browser-like User-Agent and return the body text.

    Returns an empty string when the request fails (connection problem,
    timeout, invalid URL, or an HTTP error status reported by
    ``raise_for_status``).
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        # Decode with the encoding detected from the body itself.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ''


def _load_feed(url):
    """Fetch ``url`` and return the decoded JSON object, or {} if unusable.

    An empty response (getHtml() failure), invalid JSON, or a JSON value that
    is not an object all yield {} so callers can use ``.get`` uniformly.
    """
    text = getHtml(url)
    if not text:
        return {}
    try:
        payload = json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return {}
    return payload if isinstance(payload, dict) else {}


# NOTE: this name shadows the builtin next(); it is kept unchanged so that
# existing callers keep working.
def next(max_behot_time, data_list):
    """Fetch the feed page for ``max_behot_time`` and extend ``data_list``.

    Args:
        max_behot_time: Value placed in the ``max_behot_time`` query
            parameter (converted with ``str``).
        data_list: List that receives the items of the response's ``data``
            field; modified in place.

    Returns:
        The same ``data_list`` object. It is unchanged when the page could
        not be fetched or decoded, or has no ``data`` field.
    """
    payload = _load_feed(FEED_URL + str(max_behot_time))
    data_list.extend(payload.get('data') or [])
    return data_list


def main():
    """Fetch the first feed page plus the following one and print the items."""
    first_page = _load_feed(FEED_URL + '0')
    data_list = first_page.get('data') or []

    # The first response carries the value to request the following page with.
    next_info = first_page.get('next')
    max_behot_time = next_info.get('max_behot_time') if isinstance(next_info, dict) else None
    if max_behot_time is not None:
        data_list = next(max_behot_time, data_list)

    print(data_list)


if __name__ == '__main__':
    main()