Python——爬虫思路

 

爬虫:请求和过滤

 

编写正则的思路: 

1. 找到包裹所有数据的父标签

2. 通过[\s\S]*?跳到需要数据标签开头,写上标签开头作定位开头

3. 用(?P<标签>[\s\S]*?)分组提取该数据

4.写上此标签结尾作定位结尾

5.重复第二步到需要的数据都被分组包裹

<div class="banner_detail_form">[\s\S]*?<img src="(?P<cimgUrl>[\s\S]*?)" alt="[\s\S]*?">[\s\S]*?<h1>(?P<cName>[\s\S]*?)</h1>[\s\S]*?<p class="subtitle">(?P<cAuthor>[\s\S]*?)</p>[\s\S]*?<a href="[\s\S]*?">(?P<cArea>[\s\S]*?)</a>[\s\S]*?<span class="block">[\s\S]*?</span>[\s\S]*?<span class="block">点击:(?P<cClick>[\s\S]*?)</span>[\s\S]*?<a href="[\s\S]*?" target="[\s\S]*?">(?P<cSort>[\s\S]*?)</a>

 

 

 

完整思路:

  找到需要爬取的网站,获取需要数据的父标签。通过regex写出正则思路。在程序里以url,headers,params作为开头写好请求对象来获取文本内容。用re.compile(pattern)生成正则对象,通过pattern.finditer或pattern.search获取大列表内所有匹配对象或第一个匹配对象。通过i.group('标签')获取数据

 

# Target page: https://zhwsxx.com/book/26027
# Scrape all of the comic's detail data.

# 1. Regex written per the steps above: anchor on the parent tag, skip
#    with [\s\S]*?, capture each field in a named group.

# 2. Send the request
url = "https://zhwsxx.com/book/26027"
headers = {
    # Browser User-Agent so the site serves the normal HTML page.
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
response = requests.get(url, headers=headers)
page_content = response.text
response.close()  # release the connection once the text has been read

# 3. Filter the page.  Raw strings (r'...') keep the \s / \S escapes
#    literal; the pattern is split over several lines via implicit
#    string concatenation for readability.
pattern = re.compile(
    r'<div class="banner_detail_form">[\s\S]*?'
    r'<img src="(?P<cimgUrl>[\s\S]*?)" alt="[\s\S]*?">[\s\S]*?'
    r'<h1>(?P<cName>[\s\S]*?)</h1>[\s\S]*?'
    r'<p class="subtitle">(?P<cAuthor>[\s\S]*?)</p>[\s\S]*?'
    r'<a href="[\s\S]*?">(?P<cArea>[\s\S]*?)</a>[\s\S]*?'
    # BUGFIX: the original had a bare [\s\S] (exactly ONE character)
    # between the two <span class="block"> tags, which fails whenever
    # the HTML separates them with more than one character.
    r'<span class="block">[\s\S]*?</span>[\s\S]*?'
    r'<span class="block">点击:(?P<cClick>[\s\S]*?)</span>[\s\S]*?'
    r'<a href="[\s\S]*?" target="[\s\S]*?">(?P<cSort>[\s\S]*?)</a>'
)
result = pattern.finditer(page_content)
for i in result:
    print(i.groupdict())

 

# Target page: https://movie.douban.com/top250
# Extract title, year and rating for each movie.

# 1. Regex below; written with the same anchor/skip/capture steps.

# 2. Send the request
url = "https://movie.douban.com/top250"
headers = {
    # Douban returns an empty page without a browser User-Agent.
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
response = requests.get(url, headers=headers)
# Page source
page_content = response.text
response.close()  # release the connection once the text has been read

# 3. Filter the data (raw strings so \s / \S are not treated as escapes).
obj = re.compile(
    r'<li>[\s\S]*?<span class="title">(?P<mName>[\s\S]*?)</span>'
    r'[\S\s]*?<br>(?P<myear>[\s\S]*?)</p>'
    r'[\s\S]*?<span>(?P<mRating>[\s\S]*?)</span>'
)
result = obj.finditer(page_content)

for i in result:
    movie = i.groupdict()  # renamed: do not shadow the builtin `dict`
    # The year group captures surrounding whitespace from the HTML.
    movie['myear'] = movie['myear'].strip()
    # BUGFIX: the original built the dict and then discarded it.
    print(movie)

 

# Scraped site: https://www.jiumanhua.com/

def getCatoonDigui(url, cName, page, domain):
    """Download every image of a chapter, then follow the "next chapter"
    link until there is no further chapter.

    url    -- absolute URL of the chapter page to start from
    cName  -- comic title, used as the directory name under requests/
    page   -- 1-based chapter counter, used as the sub-directory name
    domain -- site root, prepended to the relative "next chapter" links
    """
    # Raw strings keep the \s / \S escapes literal for the regex engine.
    c_next_page = re.compile(
        r'<div class="control clearfix">[\s\S]*?<div class="item next">[\s\S]*?<a href="(?P<next>[\s\S]*?)">下一话<i>')
    c_img_pattern = re.compile(
        r'<figure class="item">[\s\S]*?<img class="show-menu lazy" src="[\s\S]*?" data-id="[\s\S]*?" data-original="(?P<img>[\s\S]*?)" style="background')
    headers = {
        "user-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
    }
    # BUGFIX: iterate instead of recursing.  The original recursed once
    # per chapter, so a long comic would exceed Python's recursion limit
    # (~1000 frames); the loop also compiles the patterns only once.
    while True:
        response = requests.get(url=url, headers=headers)
        child_page_content = response.text
        response.close()
        # exist_ok lets a re-run resume without crashing on mkdir.
        os.makedirs('requests/{0}/{1}'.format(cName, page), exist_ok=True)
        for size, match in enumerate(c_img_pattern.finditer(child_page_content), start=1):
            img_url = match.group('img')
            img_response = requests.get(img_url, headers=headers)
            ext = os.path.splitext(img_url)[1]  # keep the image's own extension
            with open('requests/{0}/{1}/{2}{3}'.format(cName, page, size, ext), mode="wb") as fd_write:
                fd_write.write(img_response.content)
            img_response.close()
            time.sleep(0.2)  # be polite: throttle image requests

        # This chapter is fully downloaded; look for the next one.
        c_next = c_next_page.search(child_page_content)
        if c_next is None:
            break
        page += 1
        url = domain + c_next.group('next')

def getCartoon(domain, url):
    """Scrape a comic's detail page: extract the title, create the output
    directory, locate the "start reading" link and hand the first chapter
    URL to the chapter downloader.

    domain -- site root, prepended to relative links found on the page
    url    -- absolute URL of the comic's detail page
    """
    # Raw strings keep the \s / \S escapes literal for the regex engine.
    c_title_pattern = re.compile(
        r'<div class="container">[\s\S]*?<div class="title">(?P<cName>[\s\S]*?)</div>[\s\S]*?<a href="javascript:void[(]0[)];">(?P<cSort>[\s\S]*?)</a>')
    c_button_pattern = re.compile(
        r'<div class="inner">[\s\S]*?<a href="(?P<url>[\s\S]*?)" class="btn" title="[\s\S]*?">开始阅读</a>')
    headers = {
        "user-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
    }
    response = requests.get(url=url, headers=headers)
    page_content = response.text
    response.close()
    c_title = c_title_pattern.search(page_content)
    c_button = c_button_pattern.search(page_content)
    if c_title is None or c_button is None:
        # Fail loudly with context instead of an opaque AttributeError
        # on .group() when the site's layout changes.
        raise ValueError("could not locate title/read button on page: " + url)
    cName = c_title.group('cName')
    # exist_ok lets a re-run resume without crashing on mkdir.
    os.makedirs('requests/{0}'.format(cName), exist_ok=True)
    first_chapter_url = domain + c_button.group("url")
    getCatoonDigui(url=first_chapter_url, cName=cName, page=1, domain=domain)
    print("over")

# Entry point: scrape one comic when run as a script.  The guard keeps
# the network calls from firing if this module is ever imported.
if __name__ == "__main__":
    url = "https://www.jiumanhua.com/comics/6601"
    domain = "https://www.jiumanhua.com/"
    getCartoon(domain, url)

 

posted @ 2021-10-08 22:06  remix_alone  阅读(481)  评论(0)    收藏  举报