Basic concepts of web crawlers:
A crawler is a program or script that automatically fetches information from the World Wide Web according to a set of rules.
robots.txt: the file in which each website publishes its rules for external crawlers
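A site's robots.txt can also be checked programmatically before crawling; a minimal sketch using the standard-library urllib.robotparser (the Dangdang URLs here are just an illustration):

from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("http://search.dangdang.com/robots.txt")
rp.read()  # download and parse the site's crawler rules
# True if a crawler with this User-Agent may fetch the given URL
print(rp.can_fetch("*", "http://search.dangdang.com/?key=python"))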
Using crawler libraries:
urllib / urllib2 / urllib3 (there is no urllib4; in Python 3, urllib2 was merged into urllib.request, and urllib3 is a third-party package; see the urllib sketch after this list)
requests: a library for sending HTTP requests to a URL
BeautifulSoup: installed and imported as the bs4 package
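For comparison with requests, here is a minimal sketch of fetching a page with the standard-library urllib.request (the URL and the GBK decoding are assumptions for illustration):

from urllib.request import Request, urlopen

# Many sites reject requests that lack a browser-like User-Agent.
req = Request(
    "http://search.dangdang.com/?key=python&act=input",
    headers={"User-Agent": "Mozilla/5.0"},
)
with urlopen(req, timeout=10) as resp:
    # Assumption: the page is GBK-encoded; adjust the codec if needed.
    html = resp.read().decode("gbk", errors="replace")
print(html[:200])  # first 200 characters of the page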
Install requests, bs4, and lxml (for example: pip install requests beautifulsoup4 lxml; the import name is bs4)
import requests
from bs4 import BeautifulSoup
import re
import time  # used to pause between page requests


def main():
    url = "http://search.dangdang.com/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/89.0.4389.82 Safari/537.36"
    }
    for i in range(2, 4):
        params = {
            "key": "c#",
            "act": "input",
            "page_index": i,
            # Referer: http://search.dangdang.com/?key=python&act=input
        }
        # pass the variables defined above as keyword arguments
        rs = requests.get(url=url, params=params, headers=headers)
        rs.encoding = rs.apparent_encoding  # pages may not be UTF-8; let requests detect the charset
        soup = BeautifulSoup(rs.text, "lxml")
        # data = soup.find_all('a')
        data = soup(title=re.compile("C#"))  # every tag whose title attribute matches 'C#'
        for item in data:
            temp = {
                # "title": item.get_text(),
                "title": item.get("title"),
                "link": item.get("href"),
            }
            print(temp)
        time.sleep(30)  # delay between pages


if __name__ == "__main__":
    main()
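Note: soup(title=re.compile("C#")) is shorthand for soup.find_all(title=re.compile("C#")); calling a BeautifulSoup object delegates to find_all. The time.sleep(30) between pages keeps the request rate polite, in the spirit of the robots.txt rules above.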