# 爬取民法案例 (scrape civil-law cases from anli.lawtime.cn)

import csv
import datetime
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Browser-like request headers sent with every page fetch in this script.
#
# Deliberately NOT included:
#   * 'cookie' / 'authority' / 'referer' -- values captured from a jd.com
#     browser session have no meaning for anli.lawtime.cn, and a login cookie
#     must never be hard-coded in source or sent to an unrelated host.
#   * 'if-modified-since' -- a conditional request lets the server answer
#     "304 Not Modified" with an empty body, which the scraper would then
#     parse as a page containing no entries.
headers = {
    'cache-control': 'max-age=0',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 FS',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'same-site',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,ja;q=0.6',
}

def pa_menu():
    """Scrape the civil-law category menu and write it to ``data/menu.csv``.

    Fetches https://anli.lawtime.cn/minfa/ and collects the text of every
    element whose ``class`` attribute is exactly
    ``"db list-nav-a subItem active"``, followed by every element whose
    attribute is exactly ``"db list-nav-a subItem"``.  The last two
    characters of each text are dropped (presumably a fixed two-character
    suffix on the menu entries -- TODO confirm against the live page).

    Rows are numbered consecutively from 10001 and all carry the label
    "民法".  The CSV header is ``index:ID, type, :LABEL``.  The ``data``
    directory is created if it does not exist.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = "https://anli.lawtime.cn/minfa/"
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly instead of writing an empty menu scraped from an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # (exact class string, echo-to-stdout) -- only the active entry is printed.
    menu_classes = (
        ("db list-nav-a subItem active", True),
        ("db list-nav-a subItem", False),
    )

    rows = []
    next_id = 10001
    for css_class, echo in menu_classes:
        for tag in soup.find_all(class_=css_class):
            # ``.string`` is None when the tag has more than one child node;
            # fall back to the concatenated text instead of slicing None.
            text = tag.string
            if text is None:
                text = tag.get_text()
            name = text[:-2]
            if echo:
                print(name)
            rows.append([next_id, name, "民法"])
            next_id += 1

    os.makedirs("data", exist_ok=True)
    with open("data/menu.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, dialect="excel")
        writer.writerow(['index:ID', 'type', ':LABEL'])
        writer.writerows(rows)

def pa_legalCase(url, type, a, x, max_pages=49):
    """Scrape case titles and links for one category into ``data/<a>.csv``.

    Requests ``url + "list_" + str(page)`` for pages ``1..max_pages`` and, on
    each page, collects every element whose ``class`` attribute is exactly
    ``"list-main-h1 nowrap"``.  Each element yields one CSV row:
    ``[id, title, href, type]``.

    Pages that cannot be fetched (network error, timeout, HTTP error status)
    are reported on stdout and skipped, so one bad page does not discard the
    rows already collected.

    Args:
        url: Category base URL; page URLs are built by appending
            ``"list_<n>"`` to it.
        type: Label written to the ``:LABEL`` column of every row.
        a: Category slug, used as the output file name (``data/<a>.csv``).
        x: ID assigned to the first collected case; later cases get
            consecutive IDs.
        max_pages: Number of listing pages to request (default 49, i.e.
            pages 1 through 49).
    """
    rows = []
    for page in range(1, max_pages + 1):
        surl = url + "list_" + str(page)
        print(surl)
        try:
            # A timeout keeps a stalled connection from hanging the crawl.
            response = requests.get(surl, headers=headers, timeout=30)
        except requests.RequestException as exc:
            print(f"skipped {surl}: {exc}")
            continue
        if not response.ok:
            # Do not parse an error page as if it were a listing.
            print(f"skipped {surl}: HTTP {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        for tag in soup.find_all(class_="list-main-h1 nowrap"):
            # ``.string`` is None when the tag has more than one child node;
            # fall back to the concatenated text so the title is not lost.
            title = tag.string
            if title is None:
                title = tag.get_text()
            print(title)
            rows.append([x, title, tag.get('href'), type])
            x = x + 1

    # Create the output directory up front so the collected rows are not
    # lost to a FileNotFoundError after the crawl.
    os.makedirs("data", exist_ok=True)
    with open(f"data/{a}.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, dialect="excel")
        writer.writerow(['index:ID', 'title', 'link', ":LABEL"])
        writer.writerows(rows)

# (slug, label) pairs for the case categories to crawl.  Kept as an ordered
# tuple rather than a set: each category is assigned a block of 10000 IDs by
# position, so iteration order must be identical on every run.
CASE_CATEGORIES = (
    ("hunyin", "婚姻法"),
    ("laodong", "劳动法"),
    ("baoxian", "保险法"),
    ("msssf", "民事诉讼法"),
    ("jicheng", "继承法"),
    ("ywjyf", "义务教育法"),
    ("ldhtf", "劳动合同法"),
    ("guanggao", "广告法"),
    ("xfzqyf", "消费者权益法"),
    ("jiaoyu", "教育法"),
    ("huanjing", "环境法"),
    ("qita", "其他"),
)


def main():
    """Run the full crawl: the category menu, then every case category.

    Category ``i`` (0-based, in ``CASE_CATEGORIES`` order) starts its case
    IDs at ``20001 + 10000 * i``.  The start timestamp and the elapsed
    seconds are printed to stdout.
    """
    start_time = time.time()
    print("爬虫开始时间%s" % start_time)

    pa_menu()

    x = 20001
    for slug, label in CASE_CATEGORIES:
        url = "https://anli.lawtime.cn/mf" + slug + "/"
        pa_legalCase(url, label, slug, x)
        x = x + 10000

    end_time = time.time()
    print("共耗时%s" % (end_time - start_time))


if __name__ == "__main__":
    main()

 

# posted @ 2022-05-29 12:13  哦心有  阅读(18)  评论(0编辑  收藏  举报