#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author:么么哒
import requests
import re
def Reptile():
    """Crawl Baidu search results for every query listed in test.txt.

    For each pagination offset ``pn`` in 0..740 (step 10) and each line of
    ``test.txt``, fetch the Baidu result page, extract the displayed result
    URLs from the embedded JSON, and append the host portion of each URL to
    ``./baidu-today.txt`` (one per line).

    Side effects: network requests to baidu.com; appends to
    ``./baidu-today.txt``; prints progress to stdout.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        )
    }
    # NOTE: replace with your real Baidu session cookie string
    # ("name1=value1; name2=value2; ...").
    cookie = "你的cookie"
    cookie_dict = {i.split("=")[0]: i.split("=")[-1] for i in cookie.split("; ")}
    # Hoisted out of the loops: compile both patterns once instead of once
    # per page / per result (the original rebuilt them every iteration).
    display_pattern = re.compile(r'","urlDisplay":"(.*?)","urlEncoded":"')
    host_pattern = re.compile(
        r'http(.*?)://([A-Za-z0-9]+[\-]?[A-Za-z0-9]+\.|[A-Za-z0-9]+\.)((\w|\?|\.|-)*)'
    )
    # Read the query list once; the original re-opened and re-read the file
    # for every pagination offset.
    with open('test.txt', 'r', encoding='utf-8') as f:
        queries = f.read().splitlines()
    for num in range(0, 750, 10):
        for text in queries:
            target = (
                'https://www.baidu.com/s?wd={}&pn={}&ie=utf-8'
                '&gpc=stf%3D1658043774%2C1658130174%7Cstftype%3D1'
            ).format(text, num)
            r = requests.get(url=target, headers=headers, cookies=cookie_dict)
            result = display_pattern.findall(r.text)
            print(target)
            with open('./baidu-today.txt', 'a+', encoding='utf-8') as f1:
                for x in result:
                    s = str(x)
                    print(s)
                    # search() returns None when nothing matches; the
                    # original depended on a broad except catching the
                    # resulting AttributeError.
                    m = host_pattern.search(s)
                    if m is not None:
                        # '\n' replaces the original bare '\r', which only
                        # read back correctly via universal-newline
                        # translation.
                        f1.write(m.group(0) + '\n')
def filter():
    """Deduplicate the URLs collected in ./baidu-today.txt.

    Keeps the first occurrence of every line (original order preserved) and
    appends the unique lines to ``./baidu-today去重后.txt``.  I/O errors
    (e.g. a missing input file) are printed rather than raised so a partial
    crawl still finishes cleanly.

    NOTE: the name shadows the builtin ``filter``; kept for backward
    compatibility with existing callers.
    """
    try:
        with open('./baidu-today.txt', 'r', encoding='utf-8') as src:
            lines = src.readlines()
        # dict.fromkeys deduplicates in O(n) while preserving first-seen
        # order; the original set() + sort(key=list.index) was O(n^2).
        unique_lines = list(dict.fromkeys(lines))
        # Open the output once, not once per line as the original did.
        with open('./baidu-today去重后.txt', 'a+', encoding='utf-8') as dst:
            dst.writelines(unique_lines)
    except OSError as e:
        print(e)
    finally:
        print("恭喜你 去重复结束!")
if __name__ == "__main__":
    # Crawl first, then deduplicate the collected URL list.
    Reptile()
    filter()