5.14
Python 爬虫训练:豆瓣评论爬取
• 所花时间:2
• 代码行数:307
• 博客容量:1
• 代码如下:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
# Request headers sent with every fetch; a browser-like User-Agent is used
# so the site is less likely to reject the request as an automated client.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}
# Fetch the raw HTML of a single page.
def get_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            an explicit timeout the request may wait indefinitely.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request itself fails.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable page should not abort the whole crawl: report it
        # and signal "no data" the same way a non-200 answer does.
        print(f"请求失败: {url} ({exc})")
        return None
    if response.status_code != 200:
        return None
    return response.text
# Helper: stripped text of the first matching descendant, or a fallback.
def _find_text(node, name, default, **attrs):
    """Return the stripped text of the first ``name`` tag found under *node*.

    ``attrs`` are forwarded to ``node.find`` as attribute filters.
    *default* is returned when no matching tag exists.
    """
    tag = node.find(name, **attrs)
    if tag is None:
        return default
    return tag.text.strip()


# Parse one page of comments.
def parse_page(html):
    """Extract the comment records from one page of HTML.

    Args:
        html: Markup of a comments page.

    Returns:
        A list with one dict per ``<li class="comment-item">`` element,
        holding the keys ``user``, ``content``, ``time``, ``vote``,
        ``rating`` and ``location``. Fields whose tag is missing get a
        placeholder (``'Unknown'``, ``'No content'``, ``0`` or ``'无评分'``).
    """
    soup = BeautifulSoup(html, 'html.parser')
    data = []
    for comment in soup.find_all('li', class_='comment-item'):
        user_tag = comment.find('a', title=True)
        if user_tag is None:
            user = 'Unknown'
        else:
            # The first titled anchor may contain no visible text
            # (presumably an avatar link wrapping only an image -- TODO
            # confirm against the live markup); its title attribute is
            # used as the name in that case.
            user = user_tag.text.strip() or user_tag['title'].strip()

        vote_text = _find_text(comment, 'span', '0', class_='vote-count')
        try:
            vote = int(vote_text)
        except ValueError:
            # Non-numeric or empty counter text: treat as no votes instead
            # of aborting the whole page.
            vote = 0

        rating_tag = comment.find('span', class_='user-stars')
        if rating_tag is None:
            rating = '无评分'
        else:
            # .get avoids a KeyError when the stars tag has no title.
            rating = rating_tag.get('title', '无评分')

        data.append({
            'user': user,
            'content': _find_text(comment, 'span', 'No content', class_='short'),
            'time': _find_text(comment, 'span', 'Unknown', class_='comment-time'),
            'vote': vote,
            'rating': rating,
            'location': _find_text(comment, 'span', 'Unknown', class_='comment-location'),
        })
    return data
# Crawl several consecutive pages of comments.
def scrape_comments(url, pages=3, page_size=20):
    """Fetch and parse *pages* consecutive comment pages starting at *url*.

    Args:
        url: Base address of the comment listing, with or without an
            existing query string.
        pages: Number of pages to request.
        page_size: Offset step between pages, passed to the site through
            the ``start`` query parameter.

    Returns:
        A flat list of the comment dicts produced by ``parse_page`` for
        every page that could be fetched. Pages for which ``get_page``
        returns nothing are skipped.
    """
    # Append with '&' only when the base URL already carries a query string;
    # otherwise the parameter must be introduced with '?'.
    separator = '&' if '?' in url else '?'
    all_data = []
    for page_index in range(pages):
        page_url = f"{url}{separator}start={page_index * page_size}"
        html = get_page(page_url)
        if html:
            all_data.extend(parse_page(html))
    return all_data
# Pick the most-voted comments.
def get_top_comments(comments, n=10):
    """Return the *n* comments with the highest ``'vote'`` value.

    Args:
        comments: Iterable of comment dicts.
        n: Maximum number of comments to return. ``None`` returns all of
            them; zero or a negative number returns an empty list.

    Returns:
        A new list sorted by vote count in descending order. Comments with
        equal votes keep their original relative order, and a comment
        without a ``'vote'`` key is ranked as having 0 votes.
    """
    if n is not None and n <= 0:
        # A negative slice bound would silently mean "all but the last
        # |n|", which is never what a caller asking for a top-n wants.
        return []
    ranked = sorted(comments, key=lambda item: item.get('vote', 0), reverse=True)
    return ranked[:n]
# Build, save and display a word cloud.
def generate_wordcloud(text, filename='wordcloud.jpg', mask_path=None,
                       font_path=r'C:\Windows\Fonts\simkai.ttf'):
    """Segment *text* with jieba, render a word cloud, save it and show it.

    Args:
        text: Raw text to visualise.
        filename: Path the rendered image is written to.
        mask_path: Optional path of an image used as the cloud's mask.
        font_path: Font file used for rendering. The default is a Windows
            system font; pass another font file on other platforms.

    Returns:
        None. When *text* contains nothing to segment, the function prints
        a notice and returns without writing a file.
    """
    words = jieba.lcut(text)
    new_text = ' '.join(words)
    if not new_text.strip():
        # Rendering a cloud from no words fails inside the wordcloud
        # library, so skip cleanly (e.g. when the crawl returned nothing).
        print(f"没有可用于生成词云的文本,已跳过: {filename}")
        return

    mask_img = None
    if mask_path:
        # Copy the pixels into an array inside the context manager so the
        # image file handle is closed as soon as the data is loaded.
        with Image.open(mask_path) as mask_file:
            mask_img = np.array(mask_file)

    wordcloud = WordCloud(
        width=680,
        height=680,
        font_path=font_path,
        mask=mask_img,
    ).generate(new_text)
    wordcloud.to_file(filename)

    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.margins(x=0, y=0)
    plt.show()
# Program entry point.
def main():
    """Crawl hot and latest comments, print the top 10 of each, draw clouds.

    The "hot" list is ranked by vote count via ``get_top_comments``. The
    "latest" list keeps the order in which the comments were scraped from
    the ``sort=time`` listing, since re-sorting it by votes would no longer
    show the latest comments.
    """
    # Example URLs; adjust the subject id as needed.
    hot_url = 'https://book.douban.com/subject/10517238/comments/?sort=score&status=P'
    latest_url = 'https://book.douban.com/subject/10517238/comments/?sort=time&status=P'

    # Crawl both listings.
    print("正在爬取热门评论...")
    hot_comments = scrape_comments(hot_url)
    print("正在爬取最新评论...")
    latest_comments = scrape_comments(latest_url)

    # Select the first 10 comments of each ordering.
    top_hot_comments = get_top_comments(hot_comments)
    top_latest_comments = latest_comments[:10]

    # Print the selections.
    print("热门排序前10位短评信息:")
    for comment in top_hot_comments:
        print(comment)
    print("\n最新排序前10位短评信息:")
    for comment in top_latest_comments:
        print(comment)

    # Render one word cloud per listing from all scraped comment bodies.
    cloud_jobs = (
        (hot_comments, 'hot_comments_wordcloud.jpg'),
        (latest_comments, 'latest_comments_wordcloud.jpg'),
    )
    for comments, filename in cloud_jobs:
        if not comments:
            # Nothing was scraped (e.g. the site refused the requests);
            # there is no text to build a cloud from.
            print(f"未获取到评论,跳过词云生成: {filename}")
            continue
        all_content = ' '.join(comment['content'] for comment in comments)
        generate_wordcloud(all_content, filename, 'data/mask.png')
# Run the crawl only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
浙公网安备 33010602011771号