Fetching Blog Stats on a Schedule with a GitHub Workflow

Inspired by this blog post, we can use a GitHub workflow to run code on GitHub itself and store the collected data in the repository.

The first step is the Python script, which is the easy part. It is not hard to find the few key request endpoints under https://www.cnblogs.com/XuYueming/ajax (GetPostStat, blog-stats, news, sidecolumn.aspx): request them with requests, parse the responses with BeautifulSoup, and save the result as JSON.

import requests
import json
from datetime import datetime, UTC
from pathlib import Path
from bs4 import BeautifulSoup

BASE_URL = "https://www.cnblogs.com/XuYueming/ajax"  # all AJAX endpoints used below live under this path
OUTPUT_PATH = Path("data")  # snapshots are written here and committed by the workflow
INTERESTED_BLOGS = [18313014, 18397758]  # post IDs whose per-post stats we track


def fetch_html(url: str) -> BeautifulSoup:
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")


def extract_int_by_id(soup: BeautifulSoup, element_id: str) -> int:
    el = soup.find(id=element_id)
    if not el:
        return 0
    text = el.get_text(strip=True)
    digits = "".join(ch for ch in text if ch.isdigit())
    return int(digits) if digits else 0


def fetch_blog_info(blog_ids: list[int]) -> dict:
    # GetPostStat takes a JSON array of post IDs and returns per-post statistics
    resp = requests.post(f'{BASE_URL}/GetPostStat',
                         data=json.dumps(blog_ids),
                         headers={"Content-Type": "application/json; charset=utf-8"},
                         timeout=10)
    resp.raise_for_status()
    return resp.json()


def fetch_blog_stats() -> dict:
    soup = fetch_html(f"{BASE_URL}/blog-stats")
    return {
        "post_count": extract_int_by_id(soup, "stats_post_count"),
        "article_count": extract_int_by_id(soup, "stats_article_count"),
        "comment_count": extract_int_by_id(soup, "stats-comment_count"),
        "view_count": extract_int_by_id(soup, "stats-total-view-count"),
    }


def fetch_news() -> dict:
    soup = fetch_html(f"{BASE_URL}/news")
    profile_div = soup.find(id="profile_block")
    if not profile_div:
        return {"nickname": "", "join_age": "", "fans": 0, "follow": 0}

    a_tags = profile_div.find_all("a")
    nickname = a_tags[0].get_text(strip=True) if len(a_tags) > 0 else ""
    join_age = a_tags[1].get_text(strip=True) if len(a_tags) > 1 else ""

    fans_tag = profile_div.find("a", class_="follower-count")
    fans = int(fans_tag.get_text(strip=True)) if fans_tag and fans_tag.get_text(
        strip=True).isdigit() else 0

    follow_tag = profile_div.find("a", class_="folowing-count")
    follow = int(follow_tag.get_text(
        strip=True)) if follow_tag and follow_tag.get_text(strip=True).isdigit() else 0

    return {
        "nickname": nickname,
        "join_age": join_age,
        "fans": fans,
        "follow": follow
    }


def fetch_sidecolumn() -> dict:
    soup = fetch_html(f"{BASE_URL}/sidecolumn.aspx")
    data = {}

    # recent_posts = []
    # ul = soup.select_one("#sidebar_recentposts ul")
    # if ul:
    #     for li in ul.find_all("li"):
    #         a = li.find("a")
    #         if a:
    #             recent_posts.append({"title": a.get_text(strip=True), "link": a.get("href")})
    # data["recent_posts"] = recent_posts

    # tags = []
    # ul = soup.select_one("#sidebar_toptags ul")
    # if ul:
    #     for li in ul.find_all("li"):
    #         a = li.find("a")
    #         if a and "更多" not in a.get_text():
    #             count_span = li.find("span", class_="tag-count")
    #             count = int(count_span.get_text(
    #                 strip=True).strip("()")) if count_span else 0
    #             tags.append({
    #                 "name": a.get_text(strip=True).replace(f"({count})", ""),
    #                 "count": count,
    #                 # "link": a.get("href")
    #             })
    # data["tags"] = tags

    # collections = []
    # for div in soup.select("#sidebar_categories .catList"):
    #     title = div.select_one(".catListTitle")
    #     if title:
    #         title_text = title.get_text(strip=True).split("(")[0]
    #         items = []
    #         for a in div.select("ul li a"):
    #             items.append({"name": a.get_text(strip=True), "link": a.get("href")})
    #         collections.append({"title": title_text, "items": items})
    # data["collections"] = collections

    # archives = []
    # archive_div = soup.select_one("#sidebar_postarchive ul")
    # if archive_div:
    #     for a in archive_div.find_all("a"):
    #         archives.append({"name": a.get_text(strip=True), "link": a.get("href")})
    # data["archives"] = archives

    # recent_comments = []
    # comment_block = soup.select_one("#sidebar_recentcomments .RecentCommentBlock ul")
    # if comment_block:
    #     items = comment_block.find_all(recursive=False)
    #     for i in range(0, len(items), 3):
    #         title_li = items[i]
    #         body_li = items[i + 1] if i + 1 < len(items) else None
    #         author_li = items[i + 2] if i + 2 < len(items) else None
    #         if title_li and body_li and author_li:
    #             a = title_li.find("a")
    #             title = a.get_text(strip=True) if a else ""
    #             link = a.get("href") if a else ""
    #             content = body_li.get_text(strip=True)
    #             author = author_li.get_text(strip=True).lstrip("--")
    #             recent_comments.append({"title": title, "link": link, "content": content, "author": author})
    # data["recent_comments"] = recent_comments

    score_rank = {}
    ul = soup.select_one("#sidebar_scorerank ul")
    if ul:
        score_li = ul.find("li", class_="liScore")
        rank_li = ul.find("li", class_="liRank")
        score = int("".join(ch for ch in score_li.get_text()
                    if ch.isdigit())) if score_li else 0
        rank = int("".join(ch for ch in rank_li.get_text()
                   if ch.isdigit())) if rank_li else 0
        score_rank = {"score": score, "rank": rank}
    data["score_rank"] = score_rank

    return data


def main():
    snapshot = {
        "fetched_at": datetime.now(UTC).isoformat() + "Z",
        "blog_stats": fetch_blog_stats(),
        "news": fetch_news(),
        "sidecolumn": fetch_sidecolumn(),
        "interested_blogs:": fetch_blog_info(INTERESTED_BLOGS)
    }

    OUTPUT_PATH.mkdir(exist_ok=True)

    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
    output_file = OUTPUT_PATH / f"cnblogs_snapshot_{timestamp}.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(snapshot, f, ensure_ascii=False, indent=2)

    print("blog snapshot saved:", output_file)


if __name__ == "__main__":
    main()

The saved JSON looks like this:

{
  "fetched_at": "2025-11-26T14:07:43.823701+00:00Z",
  "blog_stats": {
    "post_count": 130,
    "article_count": 0,
    "comment_count": 76,
    "view_count": 10033
  },
  "news": {
    "nickname": "XuYueming",
    "join_age": "1年9个月",
    "fans": 24,
    "follow": 5
  },
  "sidecolumn": {
    "score_rank": {
      "score": 12067,
      "rank": 111361
    }
  },
  "interested_blogs:": [
    {
      "postId": 18313014,
      "viewCount": 704,
      "feedbackCount": 1,
      "diggCount": 10,
      "buryCount": 0
    },
    {
      "postId": 18397758,
      "viewCount": 1080,
      "feedbackCount": 15,
      "diggCount": 11,
      "buryCount": 0
    }
  ]
}
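
Since every run commits a new file under data/, the history can later be analyzed straight from a checkout of the repository. Here is a minimal sketch, not part of the post's script and only assuming the file layout produced above, that prints how the total view count grows over time:

import json
from pathlib import Path

# Walk the committed snapshots in chronological order (the timestamped
# filenames sort correctly) and print the total view count from each one.
for path in sorted(Path("data").glob("cnblogs_snapshot_*.json")):
    with open(path, encoding="utf-8") as f:
        snap = json.load(f)
    print(snap["fetched_at"], snap["blog_stats"]["view_count"])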

Next, the data has to be pushed back to the repository by a GitHub workflow. Create .github/workflows/fetch_blog_stats.yml and define the steps in it:

name: Fetch Blog Stats

on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * *'  # every day at 00:00 UTC

jobs:
  run-python:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          persist-credentials: true

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run Python script
        run: python scripts/fetch_blog_stats.py

      - name: Commit and push changes
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git add data/
          git commit -m "chore: auto-update by workflow" || echo "No changes to commit"
          git push origin HEAD

Don't forget to change Workflow permissions to "Read and write permissions" in the repository settings; I forgot the first time and the workflow failed.
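
As an alternative to the repository-wide setting (not what the post does, but supported by GitHub Actions), the workflow itself can request write access to repository contents with a top-level permissions block in fetch_blog_stats.yml:

# grant the GITHUB_TOKEN of this workflow permission to push commits
permissions:
  contents: write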

And that's it. You can see the result in my GitHub repository; the project is open source under the MIT license.

posted @ 2025-11-27 14:52  XuYueming