1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # @Date : 2017-08-29 18:38:23
4 # @Author : EnderZhou (zptxwd@gmail.com)
5 # @Link : http://www.cnblogs.com/enderzhou/
6 # @Version : $Id$
7
8 import requests
9 import sys
10 from Queue import Queue
11 import threading
12 from bs4 import BeautifulSoup as bs
13 import re
14
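# By default this script crawls the result URLs from 76 pages of Baidu search
# results. Usage: python.exe <this_file>.py <keyword>; wrap the keyword in
# quotes if it contains special characters.
# Results are written to txt files. Baidu promoted (ad) links are not filtered
# out yet; this may be improved later. Filtering of URLs that share the same
# site and path but differ only in their parameters is also planned.
# https://www.baidu.com/s?wd=ichunqiu&pn=10
# The wd parameter is the search keyword and pn selects the page: pn is 10 for
# the second page and grows by 10 per page, up to 750 (i.e. 76 pages).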
19
# Request headers passed to every requests.get() call in this script; the
# only entry is a desktop Chrome 60 User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/60.0.3112.78 Safari/537.36'
    ),
}
21
class BaiduSpider(threading.Thread):
    """Worker thread that crawls Baidu result pages taken from a shared queue.

    Every item in the queue is the URL of one Baidu search-result page. For
    each result link found on a page the link is requested and, when the
    response status is 200, ``response.url`` is printed and appended to
    ``out_para.txt``. Its ``scheme://host`` root is appended to
    ``out_index.txt`` unless that root is already listed there.
    """

    # Seconds to wait on any single HTTP request. Without a timeout a stalled
    # server would block the worker (and therefore ``join()``) forever.
    REQUEST_TIMEOUT = 10

    # Shared by all workers: serialises console/file output so that the
    # "already recorded?" check and the following append cannot interleave.
    _output_lock = threading.Lock()

    def __init__(self, queue):
        """Store the queue of result-page URLs this worker will drain."""
        threading.Thread.__init__(self)
        self._queue = queue

    def run(self):
        """Process queued result pages until the queue is exhausted.

        A failure on one page is reported on stderr and does not stop the
        worker from continuing with the next page.
        """
        # Function-scope import: the module is ``Queue`` on Python 2 and
        # ``queue`` on Python 3, and the file only imports the Queue class.
        try:
            from Queue import Empty
        except ImportError:
            from queue import Empty

        while True:
            # A non-blocking get avoids the empty()/get() race in which
            # another worker takes the last item between the two calls and
            # this thread then blocks forever.
            try:
                url = self._queue.get(block=False)
            except Empty:
                break
            try:
                self.spider(url)
            except Exception as e:
                # Thread boundary: stay best-effort, but say what went wrong.
                sys.stderr.write('[!] failed to crawl %s: %s\n' % (url, e))

    def spider(self, url):
        """Fetch one result page and record every reachable result link.

        :param url: address of a Baidu search-result page.
        :raises requests.exceptions.RequestException: if the result page
            itself cannot be fetched. Failures of individual result links
            are reported on stderr and skipped.
        """
        r = requests.get(url=url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        soup = bs(r.content, 'html.parser')
        # Anchors that carry a non-empty data-click attribute and have
        # neither a class nor a data-is-main-url attribute.
        urllist = soup.find_all(
            name='a',
            attrs={'data-click': re.compile('.'), 'class': None, 'data-is-main-url': None},
        )
        for anchor in urllist:
            href = anchor.get('href')
            if not href:
                continue
            try:
                l = requests.get(url=href, headers=headers, timeout=self.REQUEST_TIMEOUT)
            except requests.exceptions.RequestException as e:
                # One dead link must not discard the rest of the page.
                sys.stderr.write('[!] failed to open %s: %s\n' % (href, e))
                continue
            if l.status_code == 200:
                self._record(l.url)

    def _record(self, final_url):
        """Print *final_url* and persist it and its site root to the output files."""
        parts = final_url.split('/')
        if len(parts) < 3:
            # Not of the form scheme://host/...; no site root can be derived.
            return
        root = parts[0] + '//' + parts[2] + '\n'
        with self._output_lock:
            # Drop ``root`` from the next line to hide the site root on the console.
            sys.stdout.write(root + final_url + '\n')
            with open('out_para.txt', 'a+') as para_file:
                para_file.write(final_url + '\n')
            try:
                with open('out_index.txt') as index_file:
                    known_roots = index_file.readlines()
            except IOError:
                # File not created yet: nothing has been recorded so far.
                known_roots = []
            # Compare whole lines so one root can never match inside another.
            if root not in known_roots:
                with open('out_index.txt', 'a+') as index_file:
                    index_file.write(root)
55
def main(keyword, pages=76, thread_count=5):
    """Crawl Baidu search results for *keyword* with a pool of worker threads.

    :param keyword: raw (not URL-encoded) search text; it is percent-encoded
        here so that characters such as ``&``, ``#``, ``+`` or spaces cannot
        corrupt the query string.
    :param pages: number of result pages to queue. Page ``i`` is requested
        with ``pn = i * 10``, so the default of 76 covers pn 0..750.
    :param thread_count: number of ``BaiduSpider`` workers to run.

    Blocks until every worker has finished.
    """
    # Function-scope import keeps this block self-contained; ``quote_plus``
    # lives in ``urllib`` on Python 2 and in ``urllib.parse`` on Python 3.
    try:
        from urllib import quote_plus
    except ImportError:
        from urllib.parse import quote_plus

    encoded_keyword = quote_plus(keyword)
    queue = Queue()
    for offset in range(0, pages * 10, 10):
        queue.put('https://www.baidu.com/s?wd=' + encoded_keyword + '&pn=' + str(offset))

    threads = [BaiduSpider(queue) for _ in range(thread_count)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
70
if __name__ == '__main__':
    # Exactly one command-line argument, the search keyword, is expected.
    if len(sys.argv) == 2:
        # Create/truncate both result files before crawling starts.
        for result_file in ('out_para.txt', 'out_index.txt'):
            open(result_file, 'w').close()
        main(sys.argv[1])
    else:
        print('Enter:python %s keyword' % sys.argv[0])
        sys.exit(-1)