bs4--基本使用
相关脚本
博客园
获取博主的所有文章
# -*- coding: utf-8 -*-
# @Time : 2019/7/30 1:04
# @Author : hakim
# @File : pq.py
"""Print every post of a cnblogs blog as a Markdown link list.

Walks the paginated default post list (?page=N) until a page yields
no post links, printing one ``[title](url)`` line per article.
"""
import requests
from bs4 import BeautifulSoup

link = "http://www.cnblogs.com/planche/default.html"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
    'Host': 'www.cnblogs.com'
}

if __name__ == '__main__':
    pos = 1
    while True:
        key_dict = {'page': str(pos)}
        r = requests.get(link, headers=headers, params=key_dict, timeout=1)
        # Parse the page with BeautifulSoup's built-in html.parser.
        soup = BeautifulSoup(r.text, "html.parser")
        # Post title anchors carry class "postTitle2".
        res = soup.find_all("a", class_="postTitle2")
        # cnblogs keeps answering for page numbers past the last page, so
        # stop as soon as a page contains no post links.  (Was exit(0),
        # which tears down the whole interpreter; break ends the loop and
        # lets the script finish normally.)
        if not res:
            break
        for anchor in res:
            # strip() drops leading/trailing whitespace around the title.
            title = anchor.text.strip()
            print('[' + title + '](' + anchor['href'] + ')')
        pos += 1
获取博主标签的所有文章
# -*- coding: utf-8 -*-
# @Time : 2019/7/30 1:04
# @Author : hakim
# @File : pq.py
"""Print every post under one cnblogs tag as a Markdown link list.

Walks the tag's paginated list (…/default.html?page=N) until a page
yields no post links, printing one ``[title](url)`` line per article.
"""
import re

import requests
from bs4 import BeautifulSoup

link = "https://www.cnblogs.com/yoyoketang/tag/selenium/default.html?page="
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
    'Host': 'www.cnblogs.com'
}

if __name__ == '__main__':
    pos = 1
    while True:
        tag_links = link + str(pos)
        r = requests.get(tag_links, headers=headers, timeout=2)
        # Parse the page with BeautifulSoup's built-in html.parser.
        soup = BeautifulSoup(r.text, "html.parser")
        # Tag-page title anchors have ids containing
        # "PostsList1_rpPosts_TitleUrl" (one per repeated list row).
        res = soup.find_all("a", id=re.compile("PostsList1_rpPosts_TitleUrl"))
        # cnblogs keeps answering for page numbers past the last page, so
        # stop as soon as a page contains no post links.  (Was exit(0),
        # which tears down the whole interpreter; break ends the loop and
        # lets the script finish normally.)
        if not res:
            break
        for anchor in res:
            # strip() drops leading/trailing whitespace around the title.
            title = anchor.text.strip()
            print('[' + title + '](' + anchor['href'] + ')')
        pos += 1
简书
获取专题文章列表
# -*- coding: utf-8 -*-
# @Time : 2019/8/6 15:15
# comment : jianshu article list
# @Author : hakim
# @File : jianshu.py
"""Dump a jianshu notebook's article list to article_jianshu.txt.

Drives a headless Chrome via Selenium: scrolls to trigger lazy loading,
clicks the "load more" button until it disappears, then writes one
``[title](url)`` Markdown line per article.
"""
from selenium import webdriver
import time

# Run Chrome without a visible window.
options = webdriver.ChromeOptions()
options.add_argument('headless')
# NOTE(review): chrome_options= is deprecated in Selenium 4 (use options=);
# kept as-is for compatibility with the Selenium version this was written for.
browser = webdriver.Chrome(chrome_options=options)
try:
    browser.get("https://www.jianshu.com/nb/24452703")
    # Scroll to the bottom a few times so lazily loaded entries render.
    for _ in range(3):
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
    # Click the "load more" button repeatedly; once the list is fully
    # loaded the element is gone and the click raises, which we ignore.
    for _ in range(10):
        try:
            browser.execute_script("var a = document.getElementsByClassName('load-more'); a[0].click();")
            time.sleep(2)
        except Exception:
            # Was a bare except, which also swallowed KeyboardInterrupt /
            # SystemExit; Exception keeps the best-effort behaviour safely.
            pass
    titles = browser.find_elements_by_class_name("title")
    with open("article_jianshu.txt", "w", encoding="utf-8") as f:
        for t in titles:
            try:
                line = '[' + t.text + "](" + t.get_attribute("href") + ')'
                print(line)
                f.write(line)
                f.write("\n")
            except TypeError:
                # get_attribute may return None for non-link "title"
                # elements; skip those rows.
                pass
finally:
    # Always release the Chrome process (the original leaked it on every run).
    browser.quit()