百度搜索当天收录采集

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author:么么哒
import re
import time
from urllib.parse import quote

import requests

# Captures each result's display URL from the JSON embedded in the result page.
_URL_DISPLAY_RE = re.compile(r'","urlDisplay":"(.*?)","urlEncoded":"')
# Matches the scheme and host portion of a URL; the match stops at the first
# character that is not a word character, '?', '.' or '-'.
_SITE_RE = re.compile(r'http(.*?)://([A-Za-z0-9]+[\-]?[A-Za-z0-9]+\.|[A-Za-z0-9]+\.)((\w|\?|\.|-)*)')


def _parse_cookie_header(cookie):
    """Turn a raw ``Cookie`` header string into a ``{name: value}`` dict."""
    jar = {}
    for pair in cookie.split("; "):
        # Split on the first '=' only: cookie values may themselves contain '='.
        name, sep, value = pair.partition("=")
        if sep:  # fragments without '=' are not valid cookie pairs; skip them
            jar[name] = value
    return jar


def Reptile():
    """Query Baidu for every keyword in ``test.txt`` and record the sites found.

    Reads one keyword per line from ``test.txt`` (UTF-8, blank lines ignored),
    requests result offsets 0, 10, ..., 740 for each keyword, extracts every
    ``urlDisplay`` value from the response body and appends the scheme+host
    part of each one, one per line, to ``./baidu-today.txt``.

    The search is restricted to the 24 hours before the call. The original
    code hard-coded a pair of Unix timestamps exactly 86400 seconds apart in
    the ``gpc=stf=<start>,<end>|stftype=1`` parameter, which froze the window
    to a single day in the past.
    NOTE(review): ``gpc`` appears to be Baidu's time-range filter -- confirm.

    Network errors for a single request are printed and that request is
    skipped, so one failure does not abort the whole crawl.

    Raises:
        OSError: if ``test.txt`` cannot be read or the output file cannot be
            written.
    """
    # Read the keyword list once instead of re-reading it for every page.
    with open('test.txt', 'r', encoding='utf-8') as f:
        keywords = [line.strip() for line in f if line.strip()]

    headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'}
    cookie = "你的cookie"  # placeholder: paste your own Cookie header value here
    cookie_dict = _parse_cookie_header(cookie)

    # Rolling 24-hour window ending now, as Unix timestamps.
    end_ts = int(time.time())
    start_ts = end_ts - 24 * 60 * 60

    for num in range(0, 750, 10):
        for text in keywords:
            # Percent-encode the keyword so '&', '#', spaces, etc. cannot
            # corrupt the query string.
            target = (
                'https://www.baidu.com/s?wd={}&pn={}&ie=utf-8'
                '&gpc=stf%3D{}%2C{}%7Cstftype%3D1'
            ).format(quote(text), num, start_ts, end_ts)
            print(target)

            try:
                # A timeout keeps one stalled connection from hanging the run.
                r = requests.get(url=target, headers=headers,
                                 cookies=cookie_dict, timeout=10)
            except requests.RequestException as e:
                print(e)
                continue

            sites = []
            for shown in _URL_DISPLAY_RE.findall(r.text):
                print(shown)
                found = _SITE_RE.search(shown)
                if found is None:
                    # Display value does not look like an http(s) URL.
                    continue
                sites.append(found.group(0) + '\n')

            # Append after every response so progress survives an interruption.
            with open('./baidu-today.txt', 'a', encoding='utf-8') as f1:
                f1.writelines(sites)
             
            
def filter():
    """Write the unique lines of ``./baidu-today.txt`` to a de-duplicated file.

    Lines are compared without their line terminator, so a final line that
    lacks a trailing newline is still recognised as a duplicate. The first
    occurrence of each line is kept and the original order is preserved.
    The result is written to ``./baidu-today去重后.txt``, replacing any previous
    content so that repeated runs never re-introduce duplicates.

    I/O and decoding errors are printed rather than raised; the completion
    message is printed only when the output was actually written.
    """
    try:
        # The crawler writes this file as UTF-8, so read it back the same way
        # instead of relying on the platform default encoding.
        with open('./baidu-today.txt', 'r', encoding='utf-8') as src:
            seen = set()
            unique = []
            for line in src:
                url = line.rstrip('\r\n')
                if url not in seen:  # O(1) membership test keeps this linear
                    seen.add(url)
                    unique.append(url)

        # Open the destination once and write everything in one pass.
        with open('./baidu-today去重后.txt', 'w', encoding='utf-8') as dst:
            dst.writelines(url + '\n' for url in unique)
    except (OSError, UnicodeError) as e:
        print(e)
    else:
        print("恭喜你 去重复结束!")
            
def _main():
    """Run the crawl, then de-duplicate the URLs it collected."""
    Reptile()
    filter()


if __name__ == "__main__":
    _main()

 

posted @ 2022-07-18 15:53  射满东城湖  阅读(51)  评论(0)  编辑  收藏  举报