Web Crawler: Fetching Article Titles and Links

bs4 -- Basic Usage
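
Before the full scripts, a minimal, self-contained sketch of the bs4 calls they rely on (the HTML snippet here is made up for illustration): build a parse tree, filter tags with find_all, then read each tag's text and attributes.

from bs4 import BeautifulSoup

# a made-up snippet standing in for a downloaded page
html = '''
<div>
  <a class="postTitle2" href="/post/1">First post</a>
  <a class="postTitle2" href="/post/2">Second post</a>
</div>
'''

soup = BeautifulSoup(html, "html.parser")          # build the parse tree
for a in soup.find_all("a", class_="postTitle2"):  # filter by tag name and class
    print(a.text.strip(), a['href'])               # .text is the inner text, ['href'] an attribute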

Related Scripts

Cnblogs (博客园)

Fetch all posts by a blogger

The blog homepage paginates through a page query parameter; the script walks page 1, 2, 3, ... and stops when a page comes back without any post titles.

# -*- coding: utf-8 -*-
# @Time    : 2019/7/30 1:04
# @Author  : hakim
# @File    : pq.py
import requests
from bs4 import BeautifulSoup

link = "http://www.cnblogs.com/planche/default.html"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
    'Host': 'www.cnblogs.com'
}

if __name__ == '__main__':
    pos = 1
    while True:
        key_dict = {'page': str(pos)}                      # pagination via the ?page= query parameter
        r = requests.get(link, headers=headers, params=key_dict, timeout=1)
        soup = BeautifulSoup(r.text, "html.parser")        # parse the page with BeautifulSoup
        res = soup.find_all("a", class_="postTitle2")      # each post title is an <a class="postTitle2">

        # Cnblogs serves valid HTML for page numbers past the end, just with
        # no post titles on it, so an empty result set is the stop condition.
        if not res:
            break
        for tag in res:
            title = tag.text.strip()                       # strip() trims surrounding whitespace
            print('[' + title + '](' + tag['href'] + ')')  # emit a Markdown link
        pos += 1
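
As an aside on the selection step: bs4's select() takes a CSS selector, so the find_all call above can be written as a one-liner. A sketch of the same fetch using select():

import requests
from bs4 import BeautifulSoup

r = requests.get("http://www.cnblogs.com/planche/default.html", timeout=10)
soup = BeautifulSoup(r.text, "html.parser")
for a in soup.select("a.postTitle2"):              # CSS selector: <a> tags with class "postTitle2"
    print('[' + a.text.strip() + '](' + a['href'] + ')')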

Fetch all of a blogger's posts under a tag

Same pagination walk, but tag pages take the page number appended directly to the URL, and the title links are located by their generated id rather than by class.

# -*- coding: utf-8 -*-
# @Time    : 2019/7/30 1:04
# @Author  : hakim
# @File    : pq.py
import re

import requests
from bs4 import BeautifulSoup

link = "https://www.cnblogs.com/yoyoketang/tag/selenium/default.html?page="
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
    'Host': 'www.cnblogs.com'
}

if __name__ == '__main__':
    pos = 1
    while True:
        tag_link = link + str(pos)                       # pagination by appending the page number
        r = requests.get(tag_link, headers=headers, timeout=2)
        soup = BeautifulSoup(r.text, "html.parser")      # parse the page with BeautifulSoup
        # on tag pages the title links carry generated ids, so match any <a>
        # whose id contains "PostsList1_rpPosts_TitleUrl"
        res = soup.find_all("a", id=re.compile("PostsList1_rpPosts_TitleUrl"))

        # an empty result set means we have walked past the last page
        if not res:
            break
        for tag in res:
            title = tag.text.strip()                       # strip() trims surrounding whitespace
            print('[' + title + '](' + tag['href'] + ')')  # emit a Markdown link
        pos += 1
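
The re.compile filter above is one way to match the generated ids; bs4 also accepts a plain function as an attribute filter, which avoids the regex when a substring test is all that's needed. A small self-contained sketch (the HTML is made up to mimic the tag page's ids):

from bs4 import BeautifulSoup

html = '<a id="PostsList1_rpPosts_TitleUrl_0" href="/p/1">t1</a><a id="x" href="/p/2">t2</a>'
soup = BeautifulSoup(html, "html.parser")

# the callable receives the id value (None when the tag has no id) and keeps
# the tag when it returns True -- same matches as the re.compile version
res = soup.find_all("a", id=lambda v: v is not None and "PostsList1_rpPosts_TitleUrl" in v)
print([a['href'] for a in res])   # ['/p/1']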

Jianshu (简书)

Fetch a collection's article list

Jianshu fills the list in dynamically, so a plain requests fetch only sees the first few entries; this script drives headless Chrome through Selenium, scrolls to the bottom, and clicks the "load more" button before reading the titles.

# -*- coding: utf-8 -*-
# @Time    : 2019/8/6 15:15
# comment  : Jianshu article list
# @Author  : hakim
# @File    : jianshu.py
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# run Chrome headless (no visible browser window)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)   # chrome_options= is deprecated in newer Selenium

browser.get("https://www.jianshu.com/nb/24452703")

# scroll to the bottom a few times so the lazy-loaded entries render
for i in range(3):
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)

# keep clicking the "load more" button until the whole list is on the page
for j in range(10):
    try:
        browser.execute_script("var a = document.getElementsByClassName('load-more'); a[0].click();")
        time.sleep(2)
    except Exception:
        pass

# collect every element with class "title" and write Markdown links to a file
titles = browser.find_elements(By.CLASS_NAME, "title")   # find_elements_by_class_name was removed in Selenium 4
with open("article_jianshu.txt", "w", encoding="utf-8") as f:
    for t in titles:
        try:
            line = '[' + t.text + '](' + t.get_attribute("href") + ')'
            print(line)
            f.write(line + "\n")
        except TypeError:                     # non-link elements return None for href
            pass

browser.quit()
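
The fixed time.sleep(2) pauses are the fragile part of this script: too short on a slow network, wasted time on a fast one. Selenium's explicit waits poll for a condition instead; a sketch of the same page load using WebDriverWait (the 10-second timeout is an arbitrary choice):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)
browser.get("https://www.jianshu.com/nb/24452703")

# block until at least one title link exists, up to 10 s, instead of sleeping blindly
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "title"))
)
print(len(browser.find_elements(By.CLASS_NAME, "title")))
browser.quit()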
