from bs4 import BeautifulSoup
import time
import requests
import csv
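# NOTE: the request headers below appear to have been captured from a browser
# session via the developer tools (they include a session cookie and an
# 'if-modified-since' stamp), presumably so the requests look like ordinary
# page views. The 'authority' and 'referer' fields still point at jd.com,
# which suggests the block was copied from another scraper; the cookie is
# session-bound and may need to be refreshed before re-running the crawler.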
headers = {
    'authority': 'item.jd.com',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 FS',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'same-site',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'referer': 'https://search.jd.com/',
    'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,ja;q=0.6',
    'cookie': '__jdu=16196590092911498836650; shshshfpa=a828a369-a728-e182-048e-b5072447be6e-1619659011; shshshfpb=tOCI1z%20PGPaR3CEfad3X5Hg%3D%3D; pinId=ra4DldyvyoY_wKdMfg-NzQ; pin=jd_vbaqeJhuCzNB; unick=jd_vbaqeJhuCzNB; _tp=nTy%2BrzzrxqSaQB0srQMkNg%3D%3D; _pst=jd_vbaqeJhuCzNB; cn=124; ipLocation=%u6cb3%u5317; areaId=5; ipLoc-djd=5-248-2990-51290; __jdv=76161171|baidu|-|organic|not set|1624541667990; PCSYCityID=CN_130000_130100_130104; user-key=b71f1c66-2098-455c-a8d6-18761218a06f; TrackID=1GQVBVkoDl_4bfjMShj3z5E2QZTbfh56Eyq2y7QsR13GdfYydXHBeQpKI2oGOZnJNbiyyJqt5Qr69BhDridyaSEtCAvnCqeOxoZA1-uw_7L7c5mYqqvF-HNhckGxBUD_2; shshshfp=4e3578329e37ee251bff8051f7e59ea5; __jda=122270672.16196590092911498836650.1619659009.1624285397.1624541668.33; __jdc=122270672; shshshsID=204213ee80eea633cacd5e34507f5018_29_1624545894631; __jdb=122270672.61.16196590092911498836650|33.1624541668; 3AB9D23F7A4B3C9B=Z5VP7UVC6XSUTN77RWVU3MCMKNU3B47D7TIAVQK2GSP2JKY22AOOH4FHZMIUEZFND67AAM6RO3OUTVSPKHBUXGEYHI',
    'if-modified-since': 'Thu, 24 Jun 2021 14:44:50 GMT',
}
def pa_menu():
    """Scrape the civil-law category menu and write it to data/menu.csv as Neo4j-style node rows."""
    url = "https://anli.lawtime.cn/minfa/"
    response = requests.get(url, headers=headers).text
    soup = BeautifulSoup(response, 'html.parser')
    i = 10001  # starting node ID for the category nodes
    lists = []
    # The currently highlighted category carries an extra "active" class.
    for item in soup.find_all(class_="db list-nav-a subItem active"):
        item = item.string
        item = item[:-2]  # drop the trailing two characters of the link text
        print(item)
        lists.append([i, item, "民法"])
        i = i + 1
    # The remaining (non-active) categories.
    for item in soup.find_all(class_="db list-nav-a subItem"):
        item = item.string
        item = item[:-2]
        lists.append([i, item, "民法"])
        i = i + 1
    with open("data/menu.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, dialect="excel")
        writer.writerow(['index:ID', 'type', ':LABEL'])
        for row in lists:
            writer.writerow(row)
def pa_legalCase(url, law_type, a, x):
    """Scrape case titles and links for one category and write them to data/<a>.csv as Neo4j-style node rows."""
    lists = []
    for i in range(1, 50):  # pages 1..49 of the category listing
        surl = url + "list_" + str(i)
        print(surl)
        response = requests.get(surl, headers=headers).text
        soup = BeautifulSoup(response, 'html.parser')
        for item in soup.find_all(class_="list-main-h1 nowrap"):
            link = item.get('href')
            item = item.string
            print(item)
            lists.append([x, item, link, law_type])
            x = x + 1
    with open(f"data/{a}.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, dialect="excel")
        writer.writerow(['index:ID', 'title', 'link', ':LABEL'])
        for row in lists:
            writer.writerow(row)
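# Optional: a minimal sketch of a politer fetch helper with retries and a
# pause between attempts. It is NOT wired into pa_menu/pa_legalCase above;
# the function name and the retries/delay values are illustrative assumptions,
# not something taken from the original script.
def fetch_html(url, retries=3, delay=1.0):
    """Fetch a page with the shared headers, retrying on errors and sleeping between attempts."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)  # back off briefly before retrying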
if __name__ == "__main__":
    start_time = time.time()
    print("Crawler started at %s" % start_time)
    pa_menu()
    # Map each URL slug to its law-category label; using an ordered list keeps
    # the iteration order (and therefore each category's ID block) deterministic.
    categories = [
        ("hunyin", "婚姻法"),
        ("laodong", "劳动法"),
        ("baoxian", "保险法"),
        ("msssf", "民事诉讼法"),
        ("jicheng", "继承法"),
        ("ywjyf", "义务教育法"),
        ("ldhtf", "劳动合同法"),
        ("guanggao", "广告法"),
        ("xfzqyf", "消费者权益法"),
        ("jiaoyu", "教育法"),
        ("huanjing", "环境法"),
        ("qita", "其他"),
    ]
    x = 20001  # starting node ID; each category gets its own block of 10000 IDs
    for a, law_type in categories:
        url = "https://anli.lawtime.cn/mf" + a + "/"
        pa_legalCase(url, law_type, a, x)
        x = x + 10000
    end_time = time.time()
    print("Total time elapsed: %s seconds" % (end_time - start_time))