第一个:静态页面类 —— 爬取猫眼电影 TOP 100,使用 BeautifulSoup + requests

# Dependencies of this script; the functions below use them directly.
import json

import bs4
import requests
from bs4 import BeautifulSoup


def getHtml(url):
    """Fetch *url* and return the response body as text.

    Returns an empty string when the request fails or the server answers
    with an error status, so callers must be prepared to receive ``''``.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Use the encoding detected from the body instead of the header value.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ''


def _tag_text(parent, name, css_class):
    """Return the ``.string`` of the first matching descendant, or ``''``."""
    tag = parent.find(name, attrs={'class': css_class})
    if tag is None or tag.string is None:
        return ''
    # Convert to a plain str so the result does not keep the parse tree alive.
    return str(tag.string)


def getContent(html, info_list):
    """Parse one board page and append one dict per movie to *info_list*.

    Each dict has the keys ``title``, ``star``, ``releasetime`` and
    ``score``. A field whose tag is missing becomes an empty string.
    *info_list* is modified in place and also returned. When the page has
    no ``<dl class="board-wrapper">`` (for example because ``getHtml``
    returned ``''``), the list is returned unchanged.
    """
    soup = BeautifulSoup(html, 'html.parser')
    board = soup.find('dl', attrs={'class': 'board-wrapper'})
    if board is None:
        return info_list

    for item in board.children:
        # Children also include whitespace text nodes; only tags are entries.
        if not isinstance(item, bs4.element.Tag):
            continue
        info_list.append({
            'title': _tag_text(item, 'p', 'name'),
            'star': _tag_text(item, 'p', 'star').strip(),
            'releasetime': _tag_text(item, 'p', 'releasetime'),
            # The score is rendered as two <i> tags: integer and fraction.
            'score': (_tag_text(item, 'i', 'integer')
                      + _tag_text(item, 'i', 'fraction')),
        })
    return info_list


def saveFile(info_list):
    """Serialize *info_list* as JSON to the hard-coded output file."""
    with open('/Users/macmini-2/Desktop/GitDemo/DailyFresh/Day_fresh/Fresh/static/jsonFile.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(info_list))


def main():
    """Scrape ``depth`` pages of the board and save the collected entries."""
    info_list = []
    # Number of pages to fetch; the offset advances by 10 per page.
    depth = 1
    start_url = 'https://maoyan.com/board/4'
    for i in range(depth):
        url = start_url + '?offset=' + str(i * 10)
        html = getHtml(url)
        getContent(html, info_list)
    # The list accumulates across pages, so writing it once is sufficient.
    saveFile(info_list)


main()

 

第二个: 今日头条  (图集爬取)
import requests
import json

def getHtml(url):
    """Fetch *url* and return the response body as text.

    Sends a browser-like ``User-Agent`` header. Returns an empty string
    when the request fails or the server answers with an error status,
    so callers must be prepared to receive ``''``.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        # Use the encoding detected from the body instead of the header value.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-level failures are swallowed; programming errors and
        # KeyboardInterrupt still propagate.
        return ''

def next(max_behot_time, data_list):
    """Fetch the feed page at *max_behot_time* and add its items to *data_list*.

    *data_list* is extended in place with the entries under the response's
    ``data`` key and is also returned. If the request fails (``getHtml``
    returns ``''``), the body is not valid JSON, or there is no ``data``
    entry, the list is returned unchanged.

    NOTE: the name shadows the builtin ``next``; it is kept because it is
    this function's existing public name.
    """
    url = 'https://www.toutiao.com/api/pc/feed/?category=gallery_old_picture&utm_source=toutiao&max_behot_time=' + str(max_behot_time)
    text = getHtml(url)
    if not text:
        return data_list
    try:
        json_dict = json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return data_list
    if isinstance(json_dict, dict):
        data_list.extend(json_dict.get('data') or [])
    return data_list


if __name__ == '__main__':
    # Fetch the first feed page, then follow its cursor once for a second page.
    start_url = 'https://www.toutiao.com/api/pc/feed/?category=gallery_old_picture&utm_source=toutiao&max_behot_time=0'
    text = getHtml(start_url)
    if not text:
        # getHtml signals failure with ''; stop with a clear message instead
        # of letting json.loads raise on the empty string.
        raise SystemExit('failed to fetch the first feed page')
    json_dict = json.loads(text)
    # The response carries the cursor for the following page under 'next'.
    max_behot_time = json_dict['next']['max_behot_time']
    data_list = json_dict['data']
    # next() extends data_list in place and returns the same list.
    data_list2 = next(max_behot_time, data_list)
    print(data_list2)

 

posted on 2019-07-24 15:38  wy0925  阅读(157)  评论(0)  编辑  收藏  举报