#coding=utf-8
import os
from time import sleep
import requests
import re
from bs4 import BeautifulSoup
# Browser-like request headers passed to requests.get() in get_img_list.
#
# Only site-neutral entries are kept. The earlier version also carried
# 'Host': 'music.163.com', a music.163.com 'Referer' and a NetEase session
# 'Cookie'. An explicit Host header overrides the one derived from the URL,
# so requests to fabiaoqing.com were labelled as music.163.com, and the
# cookie handed a session for one site to an unrelated one.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/66.0.3359.181 Safari/537.36',
}
def get_img_list(url, timeout=10):
    """Download a listing page and return its lazy-loaded ``<img>`` tags.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so one stalled
            connection would freeze the whole crawl.

    Returns:
        The ``find_all`` result for ``<img>`` elements selected with
        ``class_='ui image lazy'``; empty when the page contains none.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # The HTTP status is deliberately not checked: an error page simply
    # yields no matching tags, which callers treat as "nothing to save".
    soup = BeautifulSoup(response.content, 'lxml')
    return soup.find_all('img', class_='ui image lazy')
# Characters that must not appear in a Windows file name: the reserved
# punctuation / \ : * ? " < > | plus every ASCII control character
# (0x00-0x1F, which includes \n, \r and \t).
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def validateTitle(title):
    """Turn an image title into a string that is safe to use as a file name.

    Every reserved or control character is replaced with an underscore and
    the result is cut to at most 20 characters.

    Args:
        title: The raw title text (must be a ``str``).

    Returns:
        The sanitized title, at most 20 characters long.

    Raises:
        TypeError: If ``title`` is not a string (for example ``None``).
    """
    # Replace each forbidden character with an underscore, then truncate.
    return _INVALID_FILENAME_CHARS.sub("_", title)[:20]
def _save_image(img, path):
    """Download the picture referenced by one ``<img>`` tag into ``path``.

    The file is named after the tag's sanitized ``title`` attribute and
    keeps the extension of its ``data-original`` URL. A tag that lacks
    either attribute is skipped.
    """
    image = img.get('data-original')
    title = img.get('title')
    if image is None or title is None:
        print('skip image without data-original or title attribute')
        return
    title = validateTitle(title)
    # Download first and only then open the file, so a failed request
    # does not leave an empty file behind.
    response = requests.get(image, timeout=30)
    # Do not store an HTTP error page under an image file name.
    response.raise_for_status()
    with open(path + title + os.path.splitext(image)[-1], 'wb') as f:
        f.write(response.content)


def _crawl_tag(tag_id, path, max_pages=299):
    """Walk the listing pages of one tag and save every image found.

    Page 1 lives at ``.../tag/detail/id/<tag_id>.html``; later pages at
    ``.../tag/detail/id/<tag_id>/page/<n>.html``.
    """
    for i in range(1, max_pages + 1):
        if i != 1:
            child_url = ("https://fabiaoqing.com/tag/detail/id/" + tag_id
                         + "/page/" + str(i) + ".html")
        else:
            child_url = "https://fabiaoqing.com/tag/detail/id/" + tag_id + ".html"
        print('crawl url ' + child_url)
        try:
            img_list = get_img_list(child_url)
        except requests.RequestException as e:
            # A failed listing request abandons this tag only, not the
            # whole crawl.
            print(str(e))
            break
        if not img_list:
            # Presumably a page without images means we are past the tag's
            # last page, so the remaining page numbers are not requested.
            break
        for img in img_list:
            # Best effort per image: report the problem and carry on.
            try:
                _save_image(img, path)
            except Exception as e:
                print(str(e))


try:
    path = "d:/crawl1/"
    # open() does not create missing directories, so make sure the target
    # folder exists before the first image is written.
    if not os.path.isdir(path):
        os.makedirs(path)
    # Tag ids are consecutive integers; iterate them directly instead of
    # building every URL and parsing the id back out of it.
    for tag_number in range(1, 54673 + 1):
        _crawl_tag(str(tag_number), path)
except Exception as e:
    print(str(e))