python爬虫

import requests
from bs4 import BeautifulSoup
import time
import random
import jieba
from collections import Counter
import re

更完善的请求头

headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,/;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Referer': 'https://www.douban.com/',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}

豆瓣书籍ID映射（直接使用ID避免搜索失败）

BOOK_IDS = {
'平凡的世界': '1084165',
'活着': '1082154',
'白夜行': '10554308'
}

def get_comments(book_name, sort='hot', page_limit=3):
"""获取豆瓣图书评论"""
book_id = BOOK_IDS.get(book_name)
if not book_id:
print(f"未找到书籍《{book_name}》的ID")
return []

base_url = f'https://book.douban.com/subject/{book_id}/comments'
comments = []

for page in range(page_limit):
    try:
        # 添加随机延迟
        time.sleep(random.uniform(3, 6))

        params = {
            'sort': 'new_score' if sort == 'hot' else 'time',
            'start': page * 20
        }

        response = requests.get(base_url, headers=headers, params=params, timeout=15)
        response.encoding = 'utf-8'

        # 检查是否被重定向到登录页
        if 'accounts.douban.com' in response.url:
            print("需要登录才能查看评论，请添加cookie")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')

        # 新版豆瓣评论选择器
        comment_items = soup.select('div.comment-item')
        if not comment_items:
            comment_items = soup.select('li.comment-item')  # 备用选择器

        for item in comment_items:
            try:
                user = item.select_one('.comment-info a').get_text(strip=True)
                rating = item.select_one('.rating')['title'] if item.select_one('.rating') else '无评分'
                comment_time = item.select_one('.comment-time').get_text(strip=True)
                content = item.select_one('.short').get_text(strip=True)
                votes = item.select_one('.vote-count').get_text(strip=True) if item.select_one(
                    '.vote-count') else '0'

                comments.append({
                    '用户名': user,
                    '评分': rating,
                    '评论时间': comment_time,
                    '短评内容': content,
                    '点赞数': int(votes)
                })
            except Exception as e:
                continue

        print(f"已获取《{book_name}》第{page + 1}页评论（{sort}）")

    except Exception as e:
        print(f"获取第{page + 1}页评论时出错: {str(e)}")
        continue

return comments

def display_comments(comments, title):
"""显示评论信息"""
if not comments:
print(f"{title}: 无数据")
return

print(f"\n{title}:")
print("-" * 120)
print(f"{'序号':<5}{'用户名':<20}{'评分':<10}{'点赞数':<10}{'评论时间':<25}{'短评内容':<50}")
print("-" * 120)

for i, comment in enumerate(comments[:10], 1):
    print(f"{i:<5}{comment['用户名'][:18]:<20}{comment['评分']:<10}{comment['点赞数']:<10}"
          f"{comment['评论时间'][:24]:<25}{comment['短评内容'][:48]:<50}")
print("-" * 120)

def analyze_comments(comments):
"""分析评论内容"""
if not comments:
return

# 合并所有评论内容
text = ' '.join([c['短评内容'] for c in comments])

# 使用jieba分词
words = jieba.lcut(text)

# 过滤停用词
stop_words = ['的', '了', '和', '是', '在', '我', '有', '也', '都', '这', '就', '要', '不', '人', '会']
words = [w for w in words if len(w) > 1 and w not in stop_words]

# 统计词频
word_counts = Counter(words).most_common(10)

print("\n高频词汇Top10:")
for word, count in word_counts:
    print(f"{word}: {count}次")

def main():
print("请选择要分析的书籍:")
books = list(BOOK_IDS.keys())
for i, book in enumerate(books, 1):
print(f"{i}. {book}")

try:
    choice = int(input("请输入选择(1-3): ")) - 1
    book_name = books[choice]
except:
    book_name = '平凡的世界'

# 获取热门评论
print(f"\n开始获取《{book_name}》热门评论...")
hot_comments = get_comments(book_name, 'hot')
display_comments(hot_comments, "热门评论Top10")

# 获取最新评论
print(f"\n开始获取《{book_name}》最新评论...")
new_comments = get_comments(book_name, 'new')
display_comments(new_comments, "最新评论Top10")

# 分析评论
all_comments = hot_comments + new_comments
if all_comments:
    # 按点赞数排序
    top_voted = sorted(all_comments, key=lambda x: x['点赞数'], reverse=True)[:10]
    display_comments(top_voted, "点赞数Top10")

    # 文本分析
    analyze_comments(all_comments)
else:
    print("\n未能获取任何评论数据，可能原因：")
    print("1. 豆瓣反爬机制限制，请稍后再试")
    print("2. 需要登录才能查看（请添加cookie）")
    print("3. 网络连接问题")

if name == "main":
# 设置jieba分词
jieba.setLogLevel(jieba.logging.INFO)

main()

posted @ 2025-06-04 22:52 曹明阳阅读(11) 评论(0) 收藏举报

刷新页面返回顶部

dahais4c

python爬虫

更完善的请求头

豆瓣书籍ID映射（直接使用ID避免搜索失败）

公告