Python 爬取 MT 论坛主题帖，小批量抓取想看的主题
# Scrape thread titles and links from a few listing pages of one MT forum
# board (bbs.binmt.cc) and append them, one "title,link" line per thread, to a
# text file named after the board title found on each page.
import random

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from lxml import etree

# NOTE: `random` and `BeautifulSoup` are not used by the code below; the
# imports are kept so nothing else relying on them breaks.

# Listing URL of the board; `{}` is the page number. Change the part before
# `page=` yourself to point at a different board.
BOARD_URL = 'https://bbs.binmt.cc/forum.php?mod=forumdisplay&fid=40&page={}'

# Page numbers to fetch (end is exclusive).
PAGE_RANGE = range(493, 496)

# Output file template; `{}` is the board title. A raw string is required
# here: sequences such as `\S`, `\p`, `\m` and `\{` are invalid escapes in a
# normal string literal and trigger warnings on current Python versions.
OUTPUT_TEMPLATE = r'D:\Study\pythonProject\scrapy\paqu_mt_luntan\mt_luntan\mt\{}.txt'

# Seconds to wait for the server; without a timeout requests can block
# indefinitely on an unresponsive connection.
REQUEST_TIMEOUT = 10

# XPath expressions for the listing page. Titles and links are taken from the
# same <a> element (the second link inside each post's h2/span).
TITLE_XPATH = '//*[@class="comiis_postlist cl"]/h2/span/a[2]//text()'
LINK_XPATH = '//*[@class="comiis_postlist cl"]/h2/span/a[2]//@href'
BOARD_XPATH = '//*[@class="comiis_infotit cl"]/h1/a//text()'

# One User-Agent string is picked once and reused for every request.
user_agent = UserAgent(verify_ssl=False).random
headers = {'User-Agent': user_agent}

for page in PAGE_RANGE:
    url = BOARD_URL.format(page)
    html = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT).text

    selector = etree.HTML(html)
    titles = selector.xpath(TITLE_XPATH)
    links = selector.xpath(LINK_XPATH)
    # The heading may come back as several text nodes; join them into one
    # string, which is then used as the output file name.
    board_name = ''.join(selector.xpath(BOARD_XPATH))

    # Append mode: every fetched page of the same board adds to the same file.
    with open(OUTPUT_TEMPLATE.format(board_name), 'a', encoding='utf-8') as f:
        # zip() pairs each title with its link and stops at the shorter list,
        # so a length mismatch between the two XPath results cannot raise
        # IndexError the way indexing by position did.
        for title, link in zip(titles, links):
            f.write(title + ',' + link + '\n')
如果人生还有重来，那就不叫人生。

浙公网安备 33010602011771号