博客园

import requests
from lxml import etree
from bs4 import BeautifulSoup
from selenium  import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
import re
# Root URL of the blog being scraped.
url_0 = "https://www.cnblogs.com/l10n/"
# First listing page. url_0 already ends with "/", so strip it before
# appending; plain concatenation produced ".../l10n//?page=1".
url = url_0.rstrip("/") + "/?page=1"
# Browser-like User-Agent sent with the HTTP requests in this script.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
# Text searched for in a post's <title> to detect the password prompt page.
title_mima = "博文阅读密码验证"
# Sample post URLs; presumably one password-protected ("mima") and one
# public ("wumima") -- NOTE(review): inferred from the names, confirm.
url_mima = "https://www.cnblogs.com/l10n/p/17023036.html"
url_wumima = "https://www.cnblogs.com/l10n/p/7528845.html"
# Default output file name for a saved MHTML snapshot.
file_name = "aa.mhtml"

def is_in(full_str, sub_str):
    """Return True when ``sub_str`` occurs literally inside ``full_str``.

    The comparison is a plain substring test. Regex metacharacters in
    ``sub_str`` (``.``, ``(``, ``+`` ...) are matched as ordinary characters
    instead of being interpreted as a pattern, so arbitrary titles can be
    passed without escaping and without risking ``re.error``.

    Args:
        full_str: The text to search in.
        sub_str: The text to look for.

    Returns:
        bool: True if ``sub_str`` is contained in ``full_str``, else False.
    """
    return sub_str in full_str



# Fetch one post and report whether it is served behind the password prompt.
# The same User-Agent header is sent as for the other requests in this file.
resp = requests.get(url_wumima, headers=headers)
resp.encoding = "UTF-8"
soup = BeautifulSoup(resp.text, 'lxml')

# Presumably a sample of the <title> element of a password-protected post;
# it is not used by the code visible here.
aaa = "<title>博文阅读密码验证 - 博客园</title>"
# soup.title is None when the response contains no <title> element; fall
# back to an empty title so the check reports "no password" instead of
# raising AttributeError.
page_title = soup.title.get_text() if soup.title is not None else ""
if is_in(page_title, title_mima):
    print("有密码")
else:
    print("无密码")

# Fetch the first listing page of the blog.
resp = requests.get(url, headers=headers)
resp.encoding = "UTF-8"

def get_last_page_num(url):
    """Return the last page number shown in the pager of a listing page.

    The page is expected to be a listing URL such as ``...?page=1``. The text
    of the fifth ``<a>`` inside every element with ``class="pager"`` is
    collected; the value is trusted only when exactly two such texts are
    found and they agree (presumably the pager is rendered twice on the
    page -- confirm against the live markup).

    Args:
        url: URL of a listing page.

    Returns:
        The page-number text taken from the pager, or ``None`` when the
        pager does not have the expected shape.
    """
    # Send the same browser-like headers as the other fetch helpers.
    resp = requests.get(url, headers=headers)
    resp.encoding = "UTF-8"
    page = etree.HTML(resp.text)
    page_num = page.xpath('//*[@class="pager"]/a[5]/text()')
    if len(page_num) == 2 and page_num[0] == page_num[1]:
        return page_num[0]
    # "null" is not a Python name; None is the intended "not found" value.
    return None

def get_page_links(url):
    """Collect the post links found on one listing page.

    Args:
        url: URL of a listing page (for example ``...?page=1``).

    Returns:
        list: The ``href`` value of every ``<a>`` directly under a
        ``<div class="postTitle">`` element, in document order.
    """
    response = requests.get(url, headers=headers)
    # Force UTF-8 before the body is decoded through ``.text``.
    response.encoding = "UTF-8"
    return etree.HTML(response.text).xpath('//div[@class="postTitle"]/a/@href')

def get_url_title(url):
    """Fetch a post page and extract its title text.

    Args:
        url: URL of a single blog post.

    Returns:
        list: The text nodes of the ``<span>`` inside the
        ``<a id="cb_post_title_url">`` element; empty when that element is
        not present. Note that a list is returned, not a single string.
    """
    response = requests.get(url, headers=headers)
    # Force UTF-8 before the body is decoded through ``.text``.
    response.encoding = "UTF-8"
    document = etree.HTML(response.text)
    title_xpath = '//a[@id="cb_post_title_url"]/span/text()'
    return document.xpath(title_xpath)

def save_mhtml(url, file_name, password="1"):
    """Save a page as an MHTML snapshot using headless Chrome.

    The page is opened in a headless Chrome instance. When it contains a
    password box (element id ``tb_password``), ``password`` is typed in and
    the form is submitted through the ``btn_submit`` element; pages without
    that box are captured directly. The snapshot is taken with the DevTools
    ``Page.captureSnapshot`` command and written to ``file_name``.

    Args:
        url: Address of the page to capture.
        file_name: Path of the MHTML file to write.
        password: Text typed into the password box when one is present.
            Defaults to ``"1"``, the value that used to be hard-coded.

    Raises:
        Whatever Selenium raises for driver start-up, navigation or CDP
        failures, and ``OSError`` if the file cannot be written. The browser
        is shut down in every case.
    """
    opt = Options()
    opt.add_argument("--headless")
    opt.add_argument('disable-infobars')
    driver = Chrome(options=opt)
    try:
        driver.get(url)  # page load is slow here
        # find_elements returns an empty list instead of raising, so pages
        # without a password prompt can be saved as well.
        password_boxes = driver.find_elements(By.ID, "tb_password")
        if password_boxes:
            password_boxes[0].send_keys(password)
            driver.find_element(By.ID, "btn_submit").click()
        res = driver.execute_cdp_cmd('Page.captureSnapshot', {})
    finally:
        # Always release the browser process, even when a step above fails.
        driver.quit()
    # Explicit encoding so the output does not depend on the platform default.
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        f.write(res['data'])

 

posted @ 2023-08-18 19:11  屠魔的少年  阅读(4)  评论(0)    收藏  举报