利用Python爬取新浪微博营销案例库并下载到本地

 1 from bs4 import BeautifulSoup
 2 import requests,urllib.request,urllib.parse
 3 import json
 4 import time
 5 import os
 6 headers = {'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
 7             'Cookie':'SINAGLOBAL=115.171.224.117_1478818430.840585; UOR=www.baidu.com,www.sina.com.cn,; SGUID=1479602257214_22629370; U_TRS1=000000ca.e4817e03.5830f3d9.0954d478; vjuids=8b9ebf053.1588e9bbe9b.0.a7d3c9f0da2d8; lxlrtst=1480138279_o; vjlast=1479861321.1480207111.11; lxlrttp=1480138279; SCF=AvqGheyBOzJit9zuitL3eGB1w7DgNLfZqC_FT1HI_O6vrMhl4NJAJ8QKegO6Qz5961-unIGKeJj59-0w1ioamqc.; Apache=115.171.186.136_1481426939.303674; SUB=_2A251SKFNDeRxGeVM6lIU8izEwjyIHXVWP5WFrDV_PUNbm9ANLXj4kW-ZXh1EJqzVqCfCs2tJhJUwl2nPfA..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhQMFyM94ynlSl9JBZenkS15JpX5KzhUgL.FoeEeK5feozR1K52dJLoI7D8MJLoIEfeKX4G; ALF=1512965277; U_TRS2=00000088.95c945f5.584cd14e.d3ef2984; WEB2_APACHE2_YF=53ce2a867ebeada0edd63e211478fed5; WEB2_APACHE2_JA=4e81a2dfe1afdcedfb634ba45827a3fb; ULV=1481429361019:7:1:1:115.171.186.136_1481426939.303674:1480134833882; appClose=true; NTKF_T2D_CLIENTID=guestAE2E8836-1881-93C9-A9BE-EC1265A9B9B5; nTalk_CACHE_DATA={uid:kf_9378_ISME9754_3210522890,tid:1481429378473190}'}
 8 downloadlinks = []
 9 folder_path = 'D:/'
# Walk all 46 pages of the Sina Weibo marketing-case listing API and
# collect the first attachment URL of every case into `downloadlinks`.
for page in range(46):
    url = ('http://all.vic.sina.com.cn/weibo_alk/hiddphp.php'
           '?page={}&act=jplist_ajax'.format(page))
    try:
        # Timeout so one stalled page cannot hang the whole crawl.
        resp = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        print('请求失败: ' + url)
        continue
    time.sleep(1)  # be polite: throttle to ~1 request per second
    resp.encoding = 'utf-8'
    try:
        payload = json.loads(resp.text)
    except ValueError:
        # Malformed / empty response body — skip this page, keep crawling.
        print('JSON解析失败: ' + url)
        continue
    # The API returns either a JSON object keyed by case id (iterate its
    # values) or a JSON array of case dicts.  The original code guessed
    # the shape from key length (`len(i) < 5`); dispatch on type instead.
    entries = payload.values() if isinstance(payload, dict) else payload
    for entry in entries:
        try:
            downloadlinks.append(entry['attachmentArr'][0]['url'])
        except (KeyError, IndexError, TypeError):
            # Entry has no attachment — nothing to download for it.
            continue
23 
# Download every collected attachment into `folder_path`.
for item in downloadlinks:
    if item is None:  # `is not None` idiom; some entries yield null URLs
        continue
    # Percent-encode non-ASCII characters while keeping the URL's
    # structural characters (/:?=@$&) intact.
    safe_url = urllib.parse.quote(item, safe='/:?=@$&')
    time.sleep(1)  # throttle downloads
    # Use the (decoded) last path segment as the local file name.
    name = urllib.parse.unquote(item.split('/')[-1])
    try:
        urllib.request.urlretrieve(safe_url, folder_path + name)
        print(name + '下载成功')
    except urllib.error.URLError:
        # URLError is the superclass of HTTPError, so this also covers
        # connection failures/timeouts that previously crashed the loop.
        print('页面不存在')
    except ValueError:
        print('未知')

 

posted @ 2017-02-10 11:22  Erick-LONG  阅读(531)  评论(0编辑  收藏  举报