# coding=utf-8
#@author:Mana_菜小刀
import requests
import queue
import threading
import xlrd
import xlwt
from lxml import etree
from xlutils.copy import copy
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# Default request headers: a desktop-Chrome User-Agent so Baidu serves the
# normal SERP layout the xpath expressions below expect.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
}
# Create result.xls with a header row; worker threads append result rows to
# this file later (read-copy-write via xlrd/xlutils).
myxls = xlwt.Workbook()
sheet1 = myxls.add_sheet('收录search')
lst_name = ['url', '收录/未收录', '图片']
# enumerate instead of range(len(...)) — same cells written, clearer intent.
for col, title in enumerate(lst_name):
    sheet1.write(0, col, title)
myxls.save('result.xls')
def log(*args, **kwargs):
    """Thin logging shim: forward everything to print().

    Kept as a single indirection point so output could later be redirected
    (e.g. to a file or the logging module) without touching call sites.
    """
    print(*args, **kwargs)
class baiduSpider(threading.Thread):
    """Worker thread: pulls Baidu search-result URLs from a shared queue and
    appends one row per URL (url, indexed/not-indexed, has-image) to result.xls.

    result.xls is updated read-copy-write (xlrd -> xlutils.copy -> save), which
    is not safe under concurrency, so all access is serialized with _xls_lock.
    """

    # Serializes the read-modify-write cycle on result.xls; without it the 10
    # worker threads race and silently drop or corrupt rows.
    _xls_lock = threading.Lock()

    def __init__(self, queue_li, name):
        threading.Thread.__init__(self)
        self._queue = queue_li  # shared queue.Queue of search URLs
        self._name = name       # informational label (not currently read)

    def run(self):
        """Consume URLs until the queue is drained.

        Uses get_nowait()/queue.Empty instead of the empty()-then-get() pair,
        which is racy: another thread can take the last item between the two
        calls and leave this thread blocked forever on get().
        """
        while True:
            try:
                url = self._queue.get_nowait()
            except queue.Empty:
                break
            try:
                self.get_url(url)
            except Exception as e:
                # Best-effort: log and move on to the next URL rather than
                # killing the worker on a single bad page.
                log(e)

    def get_url(self, url):
        """Fetch the Baidu SERP for *url*, classify it, and record a row.

        Classification:
          - indexed ("收录") if the result-count span exists and is not the
            literal zero-results text, otherwise not indexed ("未收录");
          - has image ("有图") if any thumbnail img matched, else "无图".
        """
        requests.adapters.DEFAULT_RETRIES = 5
        # Context manager guarantees the session (and its sockets) is closed
        # even if the request raises — the original leaked it.
        with requests.session() as session:
            resp = session.get(url=url, headers=headers)
        xpather = etree.HTML(resp.text)
        strs = xpather.xpath('//span[@class="nums_text"]//text()')
        imgs = xpather.xpath('//img[@class="c-img c-img6"]/@src')
        search_mo = ['收录', '未收录']
        img_mo = ['有图', '无图']
        url_mo = url.replace('http://www.baidu.com/s?wd=', '')

        # Bug fix: the original indexed strs[0] unconditionally, so a page
        # with no nums_text span raised IndexError and the URL was silently
        # skipped. An empty match now counts as "not indexed".
        indexed = bool(strs) and strs[0] != "百度为您找到相关结果约0个"
        search_label = search_mo[0] if indexed else search_mo[1]
        img_label = img_mo[0] if (indexed and len(imgs) > 0) else img_mo[1]
        log(search_label, '丨', img_label, '丨', url_mo)

        # Serialize the whole read-copy-write so concurrent workers cannot
        # both read the same row count and overwrite each other's rows.
        with baiduSpider._xls_lock:
            workbook = xlrd.open_workbook('result.xls', formatting_info=True)
            sheet = workbook.sheet_by_index(0)
            rowNum = sheet.nrows
            newbook = copy(workbook)
            newsheet = newbook.get_sheet(0)
            newsheet.write(rowNum, 0, url_mo)
            newsheet.write(rowNum, 1, search_label)
            newsheet.write(rowNum, 2, img_label)
            newbook.save('result.xls')
def main():
    """Read search terms from the 'urls' text file (one per line), enqueue a
    Baidu search URL for each, and drain the queue with a pool of 10 worker
    threads that record results into result.xls.
    """
    queue_li = queue.Queue()
    thread_count = 10
    # Change 'urls' to your own txt filename:
    with open('urls', 'r', encoding='utf-8', errors="ignore") as f:
        # Skip blank lines (trailing newline produces an empty final entry).
        terms = [line for line in f.read().split('\n') if line]
    for term in terms:
        queue_li.put('http://www.baidu.com/s?wd={}'.format(term))
    # Bug fix: the original named every thread after `url_search`, a loop
    # variable read AFTER the loop — a NameError when the file is empty and a
    # meaningless "last URL" name otherwise. Use the worker index instead
    # (the name is informational only; baiduSpider never reads it).
    threads = [baiduSpider(queue_li, str(i)) for i in range(thread_count)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
# Script entry point: print the author's banner, then run the crawler.
if __name__ == '__main__':
    log("Mana好伟大!(^-^)V")
    main()