[Python for Beginners] Scraping Qiushibaike: Procedural and Object-Oriented Approaches

Method 1: procedural scraping (version 1.0)
import json
import requests
from lxml import etree


url = "http://www.lovehhy.net/joke/Detail/QSBK"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
}

response = requests.get(url, headers=headers)
# print(response.text)

dom = etree.HTML(response.text)
titles = dom.xpath('//div[@class="cat_llb"]/h3')
contents = dom.xpath('//div[@class="cat_llb"]/div[@id="endtext"]')
times = dom.xpath('//div[@class="cat_llb"]')
# print(times)

title_list = []
for title in titles:
    title_list.append(title.xpath('./a/text()')[0])


content_list = []
for content in contents:
    # only the first text node per post; multi-paragraph posts get truncated here
    content_list.append(content.xpath('.//text()')[0])

time_list = []
for block in times:
    # each cat_llb block's direct text nodes hold the posted time plus click count
    for text in block.xpath('./text()'):
        time_list.append(text)


zip_item = zip(title_list, content_list, time_list)
with open('content.json', 'w', encoding='utf-8') as obj_f:
    for i in zip_item:
        item = {}
        item['title'] = i[0]
        item['content'] = i[1]
        item['time'] = i[2][0:22]    # first 22 characters: the posted time
        item['click'] = i[2][22:-1]  # the remainder (minus trailing char): the click count
        obj_f.write(json.dumps(item, ensure_ascii=False) + ',\n')
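A quick aside on version 1.0's output: writing comma-separated objects means content.json is not itself a valid JSON document, and the fixed offsets 0:22 / 22:-1 break silently if the site ever changes its date format. Below is a minimal sketch of a more defensive variant. It reuses title_list, content_list, and time_list from above; the split_meta helper is invented here, and its regex only assumes the combined string ends in a run of digits (the click count), which is an assumption about the page markup rather than something verified against it.

import re
import json

def split_meta(raw):
    # Hypothetical helper (not from the original post): peel a trailing
    # click count off the combined "time + clicks" string. Assumes the
    # string ends in digits; adjust the pattern if the real markup differs.
    match = re.search(r'(\d+)\s*$', raw)
    if match:
        return raw[:match.start()].strip(), match.group(1)
    return raw.strip(), ''

items = []
for title, content, meta in zip(title_list, content_list, time_list):
    posted, clicks = split_meta(meta)
    items.append({'title': title, 'content': content.strip(),
                  'time': posted, 'click': clicks})

# Dumping the whole list at once yields a single valid JSON array.
with open('content.json', 'w', encoding='utf-8') as obj_f:
    json.dump(items, obj_f, ensure_ascii=False, indent=2)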
Method 2: procedural scraping (version 2.0, wrapped in functions)
import time
import json
import requests
from lxml import etree
from requests.exceptions import RequestException


def get_one_page(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print(response.url)  # trace which page was actually fetched
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    dom = etree.HTML(html)
    title_list = []
    for title in dom.xpath('//div[@class="cat_llb"]/h3'):
        title_list.append(title.xpath('./a/text()')[0])

    content_list = []
    for content in dom.xpath('//div[@class="cat_llb"]/div[@id="endtext"]'):
        content_list.append(content.xpath('.//text()')[0])

    time_list = []
    for block in dom.xpath('//div[@class="cat_llb"]'):
        # the loop variable is named "block" so it does not shadow the time module
        for text in block.xpath('./text()'):
            time_list.append(text)

    zip_item = zip(title_list, content_list, time_list)
    for i in zip_item:
        item = {}
        item['title'] = i[0]
        item['content'] = i[1].strip()
        item['time'] = i[2][0:22].strip()
        item['click'] = i[2][22:-1].strip()
        print(item)
        yield item


def write_to_file(content):
    # append one JSON object per line (JSON Lines format)
    with open('result.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(start):
    url = "http://www.lovehhy.net/joke/Detail/QSBK/" + str(start)
    html = get_one_page(url)
    if html is None:  # request failed or returned non-200: skip this page
        return
    for item in parse_one_page(html):
        write_to_file(item)


if __name__ == "__main__":
    for i in range(0, 5):
        main(start=i)
        time.sleep(1)
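Because write_to_file appends one JSON object per line, result.json ends up in JSON Lines format: not a single JSON document, but trivially parseable line by line. A minimal sketch for reading it back:

import json

with open('result.json', encoding='utf-8') as f:
    # each non-empty line is one self-contained JSON object
    records = [json.loads(line) for line in f if line.strip()]

print(len(records), "records loaded")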

Method 3: object-oriented scraping

import time
import json
import requests
from lxml import etree
from requests.exceptions import RequestException


class ChouShiBaiKe:

    def __init__(self):
        self.url = "http://www.lovehhy.net/joke/Detail/QSBK/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
        }

    def get_one_page(self, url):
        try:
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None

    def process_data(self, html):
        dom = etree.HTML(html)
        titles = dom.xpath('//div[@class="cat_llb"]/h3')
        contents = dom.xpath('//div[@class="cat_llb"]/div[@id="endtext"]')
        times = dom.xpath('//div[@class="cat_llb"]')
        title_list = []
        for title in titles:
            title_list.append(title.xpath('./a/text()')[0])

        content_list = []
        for content in contents:
            content_list.append(content.xpath('.//text()')[0])

        time_list = []
        for block in times:
            # again, name the loop variable "block" rather than shadowing the time module
            for text in block.xpath('./text()'):
                time_list.append(text)

        zip_item = zip(title_list, content_list, time_list)

        for i in zip_item:
            item = {}
            item['title'] = i[0]
            item['content'] = i[1].strip()
            item['time'] = i[2][0:22].strip()
            item['click'] = i[2][22:-1].strip()
            print(item)
            yield item

    def save_file(self, content):
        # append one JSON object per line (JSON Lines format)
        with open('result_class.json', 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')

    def run(self, start):
        url = self.url + str(start)  # reuse the base URL set in __init__
        html = self.get_one_page(url)
        if html is None:  # request failed: skip this page
            return
        for item in self.process_data(html):
            self.save_file(item)


if __name__ == "__main__":
    qsbk = ChouShiBaiKe()
    for i in range(0, 5):
        qsbk.run(start=i)
        time.sleep(1)
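One natural extension of the class version: each call to requests.get opens a fresh connection, so fetching five pages pays the TCP handshake five times. Below is a sketch of a subclass that reuses a single requests.Session across all page fetches; the subclass name is invented here for illustration, and everything else is unchanged.

import requests


class ChouShiBaiKeSession(ChouShiBaiKe):
    """Hypothetical variant: same scraping logic as ChouShiBaiKe above,
    but one pooled connection shared across all page fetches."""

    def __init__(self):
        super().__init__()
        self.session = requests.Session()
        self.session.headers.update(self.headers)  # send the same User-Agent

    def get_one_page(self, url):
        try:
            response = self.session.get(url)
            if response.status_code == 200:
                return response.text
            return None
        except requests.exceptions.RequestException:
            return None

Usage is unchanged: instantiate ChouShiBaiKeSession and call run(start) exactly as in the __main__ block above.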

 


