# -*- coding: utf-8 -*-
# __title__ = '爬取途虎车主文档标题链接并写入excel.py'
# __author__ = 'yangyang'
# __mtime__ = '2018.03.22'
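'''
Crawl article titles and links from the Tuhu community listing pages
(https://www.tuhu.cn/Community/Discovery.aspx) and write them to an Excel file.
Regex notes:
r'...': raw string, so backslashes in a pattern are not treated as string escapes
re.S: lets "." match newline characters as well
'''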
import requests,re
import time,os
import xlrd,xlwt
from concurrent.futures import ProcessPoolExecutor,ThreadPoolExecutor
# Fetch one listing page and extract its article titles and links.
def get_page(url, timeout=10):
    """Download one listing page and extract its article titles and links.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds ``requests`` waits for the server before raising,
            so a stalled connection cannot block a worker indefinitely.

    Returns:
        dict mapping each title to an absolute ``https://www.tuhu.cn/`` link.
        A title consists only of the characters in the U+4E00-U+9FA5 range
        found in the entry. Entries that share a title overwrite one another,
        and entries in which no link can be found are skipped.
    """
    print('<%s> is getting [%s]' % (os.getpid(), url))
    response = requests.get(url, timeout=timeout)
    page_html = response.text

    # Compile every pattern once per page instead of once per entry.
    entry_pattern = re.compile(r'<div class="title">(.*?)</a></div>', re.S)
    link_pattern = re.compile(r'/(.*?\.[a-z]{4})', re.S)  # relative link
    title_pattern = re.compile(r'[\u4e00-\u9fa5]', re.S)  # title characters

    article_dict = {}
    for entry in entry_pattern.findall(page_html):
        link_match = link_pattern.search(entry)
        if link_match is None:
            # No link in this entry: skip it rather than raising IndexError
            # and losing every other article on the page.
            continue
        title = ''.join(title_pattern.findall(entry))
        article_dict[title] = 'https://www.tuhu.cn/%s' % link_match.group(1)
    return article_dict
# Merge the result of one finished page into the shared dictionary.
def parse_page(res):
    """Done-callback that merges one finished page into ``content_dic``.

    Args:
        res: Completed future whose ``result()`` is the title -> link dict
            for one page. ``result()`` re-raises any exception the task
            raised.
    """
    # content_dic is only mutated, never rebound, so no ``global`` statement
    # is needed to reach the module-level dictionary.
    content_dic.update(res.result())
def write_excel(res, filename='车主问答.xls'):
    """Write title/link pairs to an ``.xls`` workbook.

    Args:
        res: Mapping of title -> link. Each item becomes one row, in the
            mapping's iteration order, below the header row.
        filename: Path the workbook is saved to. Defaults to the name the
            script has always used.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet(u'车主问答', cell_overwrite_ok=True)

    # Row 0 holds the column headings.
    for col, heading in enumerate((u'标题', u'链接')):
        sheet.write(0, col, heading)

    # Data rows start at 1; sheet.write takes (row, column, value).
    for row, (title, link) in enumerate(res.items(), 1):
        sheet.write(row, 0, title)
        sheet.write(row, 1, link)

    workbook.save(filename)
if __name__ == '__main__':
    # Shared result store: parse_page merges every page's articles into it,
    # so it must exist before the first task is submitted.
    content_dic = {}

    # Listing pages 1..100 of the tagId=1 feed.
    urls = [
        'https://www.tuhu.cn/Community/Discovery.aspx?tagId=1&pageIndex=%s' % page
        for page in range(1, 101)
    ]

    # The context manager shuts the pool down even if a submission raises;
    # leaving the block waits for all submitted tasks to finish.
    with ThreadPoolExecutor() as pool:
        for page_url in urls:
            pool.submit(get_page, page_url).add_done_callback(parse_page)

    write_excel(content_dic)