5.14

Python crawler practice: scraping Douban comments
• Time spent: 2
• Lines of code: 307
• Blog posts: 1
• The code is as follows:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
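import time  # used for a short delay between page requests in scrape_comments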

# Set request headers to mimic a normal browser and reduce the chance of being blocked
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
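
# Note: Douban tends to throttle or block anonymous clients after a few pages;
# if requests start failing, adding a logged-in 'Cookie' value copied from your
# browser to the headers above usually helps.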


# Fetch the raw HTML of a page; return None on any non-200 response
def get_page(url):
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.text
    else:
        return None


# Parse one page of comments into a list of dicts
def parse_page(html):
    soup = BeautifulSoup(html, 'html.parser')
    comments = soup.find_all('li', class_='comment-item')
    data = []
    for comment in comments:
        user_tag = comment.find('a', title=True)
        user = user_tag.text.strip() if user_tag else 'Unknown'
        content_tag = comment.find('span', class_='short')
        content = content_tag.text.strip() if content_tag else 'No content'
        time_tag = comment.find('span', class_='comment-time')
        comment_time = time_tag.text.strip() if time_tag else 'Unknown'
        vote_tag = comment.find('span', class_='vote-count')
        vote = int(vote_tag.text.strip()) if vote_tag else 0
        rating_tag = comment.find('span', class_='user-stars')
        rating = rating_tag['title'] if rating_tag else 'No rating'
        location_tag = comment.find('span', class_='comment-location')
        location = location_tag.text.strip() if location_tag else 'Unknown'
        data.append({
            'user': user,
            'content': content,
            'time': comment_time,
            'vote': vote,
            'rating': rating,
            'location': location
        })
    return data


# Crawl multiple pages of comments; Douban paginates 20 comments per page
def scrape_comments(url, pages=3):
    all_data = []
    for i in range(pages):
        page_url = f"{url}&start={i * 20}"
        html = get_page(page_url)
        if html:
            data = parse_page(html)
            all_data.extend(data)
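        time.sleep(1)  # pause between pages to be polite and avoid triggering rate limits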
    return all_data


# Return the n most-voted comments
def get_top_comments(comments, n=10):
    sorted_comments = sorted(comments, key=lambda x: x['vote'], reverse=True)
    return sorted_comments[:n]


# Generate a word cloud from the given text, save it, and display it
def generate_wordcloud(text, filename='wordcloud.jpg', mask_path=None):
    words = jieba.lcut(text)  # segment the Chinese text into words
    new_text = ' '.join(words)
    mask_img = np.array(Image.open(mask_path)) if mask_path else None
    # font_path must point to a font that can render Chinese
    # (a Windows path here; adjust on other systems)
    wordcloud = WordCloud(width=680, height=680, font_path=r'C:\Windows\Fonts\simkai.ttf',
                          mask=mask_img).generate(new_text)
    wordcloud.to_file(filename)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.margins(x=0, y=0)
    plt.show()
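

# A possible refinement: jieba's raw tokens include punctuation and common
# function words that tend to dominate the cloud. A hypothetical helper like
# this could be applied to `words` in generate_wordcloud before joining:
def filter_stopwords(words, stopwords=frozenset({'的', '了', '是', '我', '也', '就', '都'})):
    # keep tokens longer than one character that are not in the stopword set
    return [w for w in words if len(w) > 1 and w not in stopwords]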


# Main program
def main():
    # Example URLs (hot vs. latest sort); adjust the subject ID as needed
    hot_url = 'https://book.douban.com/subject/10517238/comments/?sort=score&status=P'
    latest_url = 'https://book.douban.com/subject/10517238/comments/?sort=time&status=P'

    # Scrape both sorts
    print("Scraping hot comments...")
    hot_comments = scrape_comments(hot_url)
    print("Scraping latest comments...")
    latest_comments = scrape_comments(latest_url)

    # Take the top 10 of each by vote count
    top_hot_comments = get_top_comments(hot_comments)
    top_latest_comments = get_top_comments(latest_comments)

    # Print the top 10 comments
    print("Top 10 short comments (hot sort):")
    for comment in top_hot_comments:
        print(comment)

    print("\n最新排序前10位短评信息:")
    for comment in top_latest_comments:
        print(comment)

    # Generate word clouds (assumes a mask image exists at data/mask.png;
    # pass None to skip the mask)
    all_content = ' '.join([comment['content'] for comment in hot_comments])
    generate_wordcloud(all_content, 'hot_comments_wordcloud.jpg', 'data/mask.png')

    all_content_latest = ' '.join([comment['content'] for comment in latest_comments])
    generate_wordcloud(all_content_latest, 'latest_comments_wordcloud.jpg', 'data/mask.png')


if __name__ == "__main__":
    main()
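

pandas is imported above but never used in this excerpt. As a minimal sketch of how the scraped list of dicts could be persisted for later analysis (the helper name save_comments and the path comments.csv are illustrative, not from the original script):

import pandas as pd

def save_comments(comments, path='comments.csv'):
    # scrape_comments returns a list of dicts with user/content/time/vote/rating/location keys
    df = pd.DataFrame(comments)
    # utf-8-sig keeps the Chinese text readable when the CSV is opened in Excel
    df.to_csv(path, index=False, encoding='utf-8-sig')

A call such as save_comments(hot_comments, 'hot_comments.csv') at the end of main() would round out the pipeline.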

