Python: How to Think About a Crawler
A crawler boils down to two steps: sending the request and filtering the response.
How to build the regex:
1. Find the parent tag that wraps all of the data.
2. Use [\s\S]*? to skip ahead to the opening tag of the data you want, and write out that opening tag as the starting anchor.
3. Capture the data itself with a named group: (?P<name>[\s\S]*?).
4. Write out that tag's closing tag as the ending anchor.
5. Repeat from step 2 until every piece of data you need is wrapped in a group, as in the pattern below.
<div class="banner_detail_form">[\s\S]*?<img src="(?P<cimgUrl>[\s\S]*?)" alt="[\s\S]*?">[\s\S]*?<h1>(?P<cName>[\s\S]*?)</h1>[\s\S]*?<p class="subtitle">(?P<cAuthor>[\s\S]*?)</p>[\s\S]*?<a href="[\s\S]*?">(?P<cArea>[\s\S]*?)</a>[\s\S]*?<span class="block">[\s\S]*?</span>[\s\S]<span class="block">点击:(?P<cClick>[\s\S]*?)</span>[\s\S]*?<a href="[\s\S]*?" target="[\s\S]*?">(?P<cSort>[\s\S]*?)</a>
Full workflow:
Find the site to scrape and locate the parent tag that contains the data, then work out the regex as described above. In the program, start from url, headers, and params, build the request, and read the page text from the response. Compile the pattern with re.compile(pattern), then use pattern.finditer to get every match in the page or pattern.search to get the first one. Finally, pull each field out with i.group('name').
# Target page: https://zhwsxx.com/book/26027
# Scrape all of the detail fields
# 1. Write the regex
# 2. Send the request
import re
import requests

url = "https://zhwsxx.com/book/26027"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
params = {}
response = requests.get(url, headers=headers, params=params)
page_content = response.text

# 3. Filter the page
pattern = re.compile(r'<div class="banner_detail_form">[\s\S]*?<img src="(?P<cimgUrl>[\s\S]*?)" alt="[\s\S]*?">[\s\S]*?<h1>(?P<cName>[\s\S]*?)</h1>[\s\S]*?<p class="subtitle">(?P<cAuthor>[\s\S]*?)</p>[\s\S]*?<a href="[\s\S]*?">(?P<cArea>[\s\S]*?)</a>[\s\S]*?<span class="block">[\s\S]*?</span>[\s\S]*?<span class="block">点击:(?P<cClick>[\s\S]*?)</span>[\s\S]*?<a href="[\s\S]*?" target="[\s\S]*?">(?P<cSort>[\s\S]*?)</a>')
result = pattern.finditer(page_content)
for i in result:
    print(i.groupdict())  # one dict of named groups per match
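One caveat: requests guesses the text encoding from the response headers, and Chinese pages sometimes come back as mojibake. Whether this particular site needs it is an assumption, but if .text looks garbled, re-sniffing the encoding from the body usually fixes it:

# if response.text comes back garbled, let requests detect the
# encoding from the body instead of trusting the headers
response.encoding = response.apparent_encoding
page_content = response.text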
# Target page: https://movie.douban.com/top250
# 1. Write the regex
# 2. Send the request
import re
import requests

url = "https://movie.douban.com/top250"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
params = {}
response = requests.get(url, headers=headers, params=params)  # fetch the page source
page_content = response.text

# 3. Filter the data
obj = re.compile(r'<li>[\s\S]*?<span class="title">(?P<mName>[\s\S]*?)</span>[\s\S]*?<br>(?P<myear>[\s\S]*?)</p>[\s\S]*?<span>(?P<mRating>[\s\S]*?)</span>')
result = obj.finditer(page_content)
for i in result:
    info = i.groupdict()  # renamed from dict to avoid shadowing the built-in
    info['myear'] = i.group('myear').strip()  # drop surrounding whitespace
    print(info)
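This only covers the first page of 25 movies. The Top 250 list paginates through the start query parameter (start=0, 25, 50, ... 225), which is exactly what the empty params dict is for; a sketch of walking all ten pages with the same approach:

import re
import time
import requests

url = "https://movie.douban.com/top250"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
obj = re.compile(r'<li>[\s\S]*?<span class="title">(?P<mName>[\s\S]*?)</span>')

for page in range(10):
    params = {"start": page * 25}  # 0, 25, 50, ..., 225
    response = requests.get(url, headers=headers, params=params)
    for m in obj.finditer(response.text):
        print(m.group('mName'))
    time.sleep(1)  # pause between pages to be polite to the server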
# Target site: https://www.jiumanhua.com/
import os
import re
import time
import requests

# "Digui" = 递归 (recursive): download one chapter page, then recurse into the next
def getCatoonDigui(url, cName, page, domain):
    c_next_page = re.compile(r'<div class="control clearfix">[\s\S]*?<div class="item next">[\s\S]*?<a href="(?P<next>[\s\S]*?)">下一话<i>')
    c_img_pattern = re.compile(r'<figure class="item">[\s\S]*?<img class="show-menu lazy" src="[\s\S]*?" data-id="[\s\S]*?" data-original="(?P<img>[\s\S]*?)" style="background')
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
    }
    params = {}
    response = requests.get(url=url, headers=headers, params=params)
    child_page_content = response.text
    iterator = c_img_pattern.finditer(child_page_content)
    os.makedirs('requests/{0}/{1}'.format(cName, page), exist_ok=True)  # makedirs so parent dirs are created too
    size = 1
    for i in iterator:
        img_url = i.group('img')
        response = requests.get(img_url, headers=headers, params=params)
        # number the images in page order, keeping the original extension
        with open('requests/{0}/{1}/{2}{3}'.format(cName, page, size, os.path.splitext(img_url)[1]), mode="wb") as fd_write:
            fd_write.write(response.content)
        size += 1
        time.sleep(0.2)
    # this chapter is downloaded; follow the "next chapter" link if there is one
    c_next = c_next_page.search(child_page_content)
    if c_next is not None:
        page += 1
        c_next_url = domain + c_next.group('next')
        getCatoonDigui(c_next_url, cName, page, domain)

def getCartoon(domain, url):
    c_title_pattern = re.compile(r'<div class="container">[\s\S]*?<div class="title">(?P<cName>[\s\S]*?)</div>[\s\S]*?<a href="javascript:void[(]0[)];">(?P<cSort>[\s\S]*?)</a>')
    c_button_pattern = re.compile(r'<div class="inner">[\s\S]*?<a href="(?P<url>[\s\S]*?)" class="btn" title="[\s\S]*?">开始阅读</a>')
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
    }
    params = {}
    response = requests.get(url=url, headers=headers, params=params)
    page_content = response.text
    response.close()
    c_title = c_title_pattern.search(page_content)
    cName = c_title.group('cName')
    os.makedirs('requests/{0}'.format(cName), exist_ok=True)
    # the "开始阅读" (start reading) button links to the first chapter
    url = domain + c_button_pattern.search(page_content).group("url")
    getCatoonDigui(url=url, cName=cName, page=1, domain=domain)
    print("over")

url = "https://www.jiumanhua.com/comics/6601"
domain = "https://www.jiumanhua.com/"
getCartoon(domain, url)
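A design note on getCatoonDigui: it recurses once per chapter, so a comic with very many chapters could hit Python's default recursion limit (roughly 1000 frames). The same walk works as a loop; a sketch of that reshaped control flow (get_cartoon_iter is a hypothetical name, and the per-chapter image download is elided because it is identical to the code above):

import re
import requests

def get_cartoon_iter(url, cName, domain):
    # same chapter walk as getCatoonDigui, but iterative, so the
    # number of chapters is not bounded by the recursion limit
    c_next_page = re.compile(r'<div class="control clearfix">[\s\S]*?<div class="item next">[\s\S]*?<a href="(?P<next>[\s\S]*?)">下一话<i>')
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
    }
    page = 1
    while url:
        child_page_content = requests.get(url, headers=headers).text
        # ... download this chapter's images exactly as in getCatoonDigui ...
        c_next = c_next_page.search(child_page_content)
        url = domain + c_next.group('next') if c_next else None
        page += 1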