python爬虫爬取指定内容

爬取一些网站下指定的内容,一般来说可以用xpath来直接从网页上来获取,但是当我们获取的内容不唯一的时候我们无法选择,我们所需要的、所指定的内容。

解决办法:
可以使用 for ... in 语句来判断
如果我们所指定的内容在这段语句中我们就把这段内容爬取下来,反之就丢弃

实例代码如下:(以我们学校为例)

import urllib.request
from lxml import etree


def creat_url(page):
    """Build a urllib Request for one listing page of the news channel.

    Page 1 has no numeric suffix in its URL; every later page is
    addressed as ``9260_<page>.html``.
    """
    base = 'https://www.qjnu.edu.cn/channels/9260'
    if page == 1:
        url = base + '.html'
    else:
        url = base + '_' + str(page) + '.html'

    # Send a desktop browser User-Agent so the site serves the normal page.
    headers = {
        'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53'
    }
    return urllib.request.Request(url=url, headers=headers)

def creat_respons(request):
    """Execute the prepared request and return the body decoded as UTF-8.

    Uses a context manager so the HTTP response (and its socket) is
    closed even if reading or decoding raises — the original left the
    response open.
    """
    with urllib.request.urlopen(request) as respons:
        return respons.read().decode('utf-8')

def down_2(url):
    """Download one article page and return its parsed lxml HTML tree.

    Fixes vs. original: drops the no-op ``url = url`` assignment and
    closes the HTTP response via a context manager instead of leaking it.
    """
    headers = {
        'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29'
    }
    request = urllib.request.Request(url=url, headers=headers)
    with urllib.request.urlopen(request) as response:
        content2 = response.read().decode('utf-8')
    return etree.HTML(content2)

def down_loads(content):
    """Scan one listing page for articles whose title contains the
    module-level keyword ``key``; log each match's URL and save its text.

    content: HTML source of a listing page (str).
    Side effects: appends matching URLs to '学校党员主题网址.txt' and
    writes one '<title>.txt' file per matching article.

    Fix vs. original: the article file used to receive ``str(tex_list)``
    — the Python list repr (quotes and brackets included) — instead of
    the actual text; the fragments are now joined with newlines.
    NOTE(review): titles are used directly as filenames — a title with a
    path separator would fail; confirm titles are plain text.
    """
    tree = etree.HTML(content)
    name_list = tree.xpath('//div[@class="media"]/h4/a/text()')
    url_list = tree.xpath('//div[@class="media"]/h4/a/@href')
    # Titles and hrefs come from the same <a> nodes, so they pair up 1:1.
    for name, url in zip(name_list, url_list):
        if key not in name:
            continue
        with open('学校党员主题网址.txt', 'a', encoding='UTF-8') as fp:
            fp.write(url + '\n')

        tree2 = down_2(url)
        tex_list = tree2.xpath('//div[@class="field-item even"]//p/span/text()')
        with open(name + '.txt', 'w', encoding='UTF-8') as fp:
            fp.write('\n'.join(tex_list))





if __name__ == '__main__':
    # Ask how many listing pages to crawl and which keyword to match.
    all_page = int(input('请输入要爬取页码:'))
    key = input('请输入关键词:')
    page = 1
    while page <= all_page:
        # Build the request, fetch the page, then filter and save articles.
        down_loads(creat_respons(creat_url(page)))
        page += 1

此段代码的可执行性没有问题,逻辑上也能够串通
但是代码冗余较多,看起来有点复杂,现在正在研究简化版的代码!

新版本代码运用了requests库
结构上也做出了优化
代码如下:

import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor

def base_url(i, url):
    """Fetch a page and return its parsed lxml HTML tree.

    An empty list passed as ``url`` is a sentinel meaning "build the
    listing-page URL for page ``i``"; any other value is fetched as-is.
    """
    if url == []:
        # Page 1 of the channel has no numeric suffix in its filename.
        if i == 1:
            url = 'https://www.qjnu.edu.cn/channels/9260.html'
        else:
            url = 'https://www.qjnu.edu.cn/channels/9260_' + str(i) + '.html'
    resp = requests.get(url)
    # Force UTF-8 before reading .text so Chinese content decodes correctly.
    resp.encoding = 'utf-8'
    return etree.HTML(resp.text)

def down_1(html):
    """Filter one parsed listing page by the module-level keyword ``key``;
    log each matching article URL and save the article body text.

    html: lxml tree of a listing page (as returned by base_url).
    Side effects: appends URLs to '学校党员主题网址.txt' and writes one
    '<title>.txt' file per matching article.

    Fix vs. original: the article file used to receive ``str(tex_list)``
    — the raw list repr — instead of readable text; the fragments are
    now joined with newlines.
    """
    name_list = html.xpath('//div[@class="media"]/h4/a/text()')
    url_list = html.xpath('//div[@class="media"]/h4/a/@href')
    for i in range(len(name_list)):
        if key in name_list[i]:
            with open('学校党员主题网址.txt', 'a', encoding='UTF-8') as fp:
                fp.write(url_list[i] + '\n')
            # A non-empty url makes base_url ignore its page argument.
            html2 = base_url(i, url_list[i])
            tex_list = html2.xpath('//div[@class="field-item even"]//p/span/text()')
            with open(name_list[i] + '.txt', 'w', encoding='UTF-8') as fp:
                fp.write('\n'.join(tex_list))

if __name__ == '__main__':
    # Number of listing pages to walk and the title keyword to match.
    page = int(input('请输入需要爬取的页数:'))
    key = input('请输入要查找的关键词:')
    for i in range(1, page + 1):
        # The empty list asks base_url to build the listing URL itself.
        down_1(base_url(i, []))
posted @ 2022-06-14 08:56  皓_月  阅读(1135)  评论(0)    收藏  举报