民事案例爬取1

爬取网页:中国法院网 https://www.chinacourt.org/index.shtml

代码如下(数据库连接部分请根据自己的环境自行修改):

import csv
from urllib.parse import urljoin

import pymysql
import requests
from bs4 import BeautifulSoup
# Crawl the article list pages FIRST_PAGE..LAST_PAGE (inclusive) of one
# chinacourt.org section, fetch each linked article, and store
# (time, title, first two paragraphs) in the MySQL table `case1`.
FIRST_PAGE = 7
LAST_PAGE = 50
BASE_URL = 'https://www.chinacourt.org/'
LIST_URL = BASE_URL + 'article/index/id/MzAwNDAwMgCRhAEA/page/{}.shtml'
# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT = 10
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
# Placeholders let the driver escape the scraped text, so quotes inside a
# title or body can neither break the statement nor inject SQL.
# NOTE(review): assumes `case1` has exactly three columns in
# (time, title, content) order -- confirm against the schema.
INSERT_SQL = "insert into case1 values(%s, %s, %s);"


def fetch_soup(url):
    """Download *url* and return it parsed with the built-in HTML parser.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    # The former text.encode("utf-8").decode("utf-8") round-trip returned the
    # same string unchanged, so res.text is used directly.
    return BeautifulSoup(res.text, 'html.parser')


# Every successfully stored article, across all pages. Nothing in this script
# consumes the list yet; it is kept available for later use.
news_list = []

# One connection for the whole run instead of one per article.
conn = pymysql.connect(
    host='localhost',
    user='root',
    password='lin0613',  # password
    db='cus',  # database name
    charset='utf8',
)
try:
    for pnum in range(FIRST_PAGE, LAST_PAGE + 1):
        try:
            bs = fetch_soup(LIST_URL.format(pnum))
        except requests.RequestException as e:
            print("请求失败:", e)
            continue

        list_div = bs.find('div', class_='list')
        if list_div is None:
            # Page layout not as expected; skip this page, keep crawling.
            continue

        # `a` is the 1-based position of the entry on the current page.
        for a, info in enumerate(list_div.find_all('li'), start=1):
            print('正在爬取第{}条案例'.format(a))
            left = info.find('span', class_='left')
            link = left.find('a') if left is not None else None
            right = info.find('span', class_='right')
            if link is None or right is None:
                # Not a regular article entry.
                continue

            news_title = link['title']
            print(news_title)
            news_url = link['href']
            print(news_url)
            news_time = right.text
            print(news_time)

            # urljoin copes with both absolute and root-relative hrefs and
            # avoids the double slash that plain concatenation produced.
            try:
                bs2 = fetch_soup(urljoin(BASE_URL, news_url))
            except requests.RequestException as e:
                print("请求失败:", e)
                continue

            detail = bs2.find('div', class_='detail_txt')
            if detail is None:
                continue
            # First two paragraphs; slicing tolerates articles with fewer.
            paper = ''.join(p.text for p in detail.find_all('p')[:2])
            # Strip the full-width spaces used for paragraph indentation.
            paper = paper.replace(u'\u3000', u'')

            try:
                with conn.cursor() as cur:
                    cur.execute(INSERT_SQL, (news_time, news_title, paper))
            except pymysql.MySQLError as e:
                conn.rollback()
                print("插入数据失败:", e)
            else:
                conn.commit()
                print("插入数据成功;")

            dict_news = {'案例标题': news_title, '时间': news_time, '主要内容':paper}
            news_list.append(dict_news)
finally:
    # Release the connection even if the crawl aborts midway.
    conn.close()

 

posted @ 2022-05-30 16:04  往心。  阅读(49)  评论(0)  编辑  收藏  举报