# Web crawler (爬虫) — bare text commented out so the file parses.
import requests
from lxml.etree import HTML
import time
import os
from multiprocessing.dummy import Pool
import sys
from fake_useragent import UserAgent
# Allow very deep recursion. NOTE(review): nothing in this script recurses;
# kept only for behavioural parity with the original.
sys.setrecursionlimit(900000)

# Output directory for downloaded images. exist_ok avoids the
# check-then-create race of the original exists()/makedirs pair.
os.makedirs('./tuip', exist_ok=True)

# Listing page of the gallery site. Pagination (".../<n>.html") was
# sketched in the original but left disabled.
url = 'http://www.yymeitu.com/mntp/'
# Analytics cookies copied from a browser visit.
# NOTE(review): these are visit-specific and will eventually expire — confirm
# whether the site actually requires them (they are only placed in `par`
# below, which is never sent).
Cookie = 'UM_distinctid=17838eb9d6971-0030ea59963bf3-353c540f-1fa400-17838eb9d6a658; CNZZDATA1277685156=809437194-1615858682-%7C1615943823'
# Pool of desktop-browser User-Agent strings.
# NOTE(review): defined but never used — the request header is taken from
# fake_useragent instead; TODO confirm this pool can be removed.
USER_AGENTS = ["Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",]
# Random User-Agent from fake_useragent (NOT drawn from USER_AGENTS above).
a=UserAgent().random
# Request headers sent with every HTTP request in this script.
hear={
'user-agent':a,
}
# NOTE(review): `par` and `proxies` are built but never passed to any
# requests.get call below — presumably leftovers; verify before removing.
par = {
'Cookie': Cookie,
}
proxies = {
'http': '110.243.0.111:9999',
}
# Fetch the listing page as text. NOTE(review): requests appears to
# mis-detect the charset here (iso-8859-1), which is why titles are
# re-encoded/decoded as UTF-8 further down — confirm before changing.
respons = requests.get(url=url, headers=hear).text
print(respons)
# Parse the HTML and select every gallery <li> entry on the page.
etree = HTML(respons)
reb = etree.xpath('/html/body/div[2]/div[8]/ul/li')
# print(reb)
# Collect a {'title', 'url'} record for every gallery entry on the page.
data = []
for item in reb:
    title = item.xpath('./a[2]/text()')[0]
    # requests decoded the UTF-8 page as iso-8859-1; round-trip the bytes
    # to recover the real (Chinese) title.
    title = title.encode('iso-8859-1').decode('utf-8')
    print(title)
    ur = item.xpath('./a[1]/i/img/@src')[0]
    print(ur)
    data.append({
        'title': title,
        'url': ur,
    })
print(data)


def rep(dic):
    """Download one image record to ./tuip/<title>.jpg."""
    res = requests.get(url=dic['url'], headers=hear).content
    fileName = './tuip/' + dic['title'] + '.jpg'
    # NOTE(review): the original opened the file as `f`, shadowing the
    # scrape-loop variable of the same name; renamed to `fp`.
    with open(fileName, 'wb') as fp:
        fp.write(res)


# The original's download tail (the un-functioned body of rep, with the
# Pool harness commented out) only ever saved the LAST scraped item;
# download every record instead.
for dic in data:
    rep(dic)
#
#
# pool = Pool(4)
# pool.map(rep, data)
# pool.close()
# pool.join()
#
# def pr():
# proxies=['182.46.123.132:9999','180.118.128.99:9000','182.46.121.253:9999','175.43.33.14:9999','60.207.131.35:80','125.73.220.18:49128',\
# '221.182.31.54:8080','118.24.89.206:1080','118.24.172.149:1080','222.141.244.206:9999','171.35.167.186:9999']
# p=[]
# for k in proxies:
# pro={
# 'http':k,
# }
# p.append(pro)
# if k== '171.35.167.186:9999':
# pr()
# return p
#
# proxies = ['182.46.123.132:9999', '180.118.128.99:9000', '182.46.121.253:9999', '175.43.33.14:9999', '60.207.131.35:80',
# '125.73.220.18:49128', \
# '221.182.31.54:8080', '118.24.89.206:1080', '118.24.172.149:1080', '222.141.244.206:9999',
# '171.35.167.186:9999']
#
# p = []
# proxies={
# 'http':'211.144.213.145:80',
# }
# for k in proxies:
# pro = {
# 'http': k,
# }
# p.append(pro)
# if k == '171.35.167.186:9999':
# print('sadsadsadas')
# #
# data=[]
# for i in proxies:
# time.sleep(1)

# 浙公网安备 33010602011771号 — ICP filing footer accidentally pasted from the
# site; commented out so the file parses.