"""Scrape the authors and replies of a Douban group topic into a spreadsheet."""

import re

import pandas as pd
import requests

# Browser-like User-Agent sent with the page request.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299'
}

# Compiled once at import time; DOTALL lets a match span several lines.
_AUTHOR_RE = re.compile(
    r'<div class="bg-img-green">.*?<h4>.*?<a.*?>(.*?)</a>', re.DOTALL
)
_CONTENT_RE = re.compile(r'<p class=" reply-content">(.*?)</p>', re.DOTALL)
_TAG_RE = re.compile(r'<.*?>')

# Seconds to wait for the server before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 10


def _extract_comments(text):
    """Return ``(authors, comments)`` parsed from the topic page HTML *text*.

    ``authors`` holds the text of the first link after each
    ``bg-img-green`` block's ``<h4>``; ``comments`` holds each
    ``reply-content`` paragraph with inline tags removed and surrounding
    whitespace stripped.
    """
    authors = _AUTHOR_RE.findall(text)
    comments = [
        _TAG_RE.sub("", raw).strip() for raw in _CONTENT_RE.findall(text)
    ]
    return authors, comments


def parse_page(url, output_path='23333.xlsx'):
    """Download *url*, extract reply authors and texts, and save them to Excel.

    Args:
        url: Address of the topic page to scrape.
        output_path: Destination ``.xlsx`` file. Defaults to the file name
            the script has always used.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page yields a different number of authors and
            comments (pandas cannot build the table from uneven columns).
    """
    response = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
    # Fail loudly on an error page rather than writing an empty spreadsheet.
    response.raise_for_status()

    authors, comments = _extract_comments(response.text)
    df = pd.DataFrame({
        '作者': authors,
        '评论': comments,
    })
    # No ``encoding`` argument: pandas 2.0 removed it from ``to_excel`` and
    # passing it raises TypeError there.
    df.to_excel(output_path)


def main():
    """Scrape the hard-coded Douban topic into the default spreadsheet."""
    url = 'https://www.douban.com/group/topic/184693273/'
    parse_page(url)


if __name__ == '__main__':
    main()
# Web-page footer residue (site filing notice, not part of the script): 浙公网安备 33010602011771号