Workflow:
1. Import the libraries.
2. Spoof the request headers (fake User-Agent).
3. Send the GET request and receive the response.
4. Import BeautifulSoup.
5. Instantiate BeautifulSoup and choose a parser.
6. Loop over the results and print them.
7. Write the main entry point and build the multi-page URL list (a commented skeleton of all seven steps is sketched right below).
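Mapped onto code, the steps look roughly like this. This is a minimal sketch: the example.com URL and the .item selector are placeholders I made up for illustration, not taken from the attached sources.

# 1. import the libraries
import time
import requests
# 4. import BeautifulSoup
from bs4 import BeautifulSoup

# 2. spoof the request headers with a browser User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

def get_info(url):
    # 3. send the GET request and receive the response
    wb_data = requests.get(url, headers=headers)
    # 5. hand the HTML to BeautifulSoup and choose a parser
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # 6. loop over the matched nodes and print them
    for node in soup.select('.item'):  # placeholder selector
        print(node.get_text())

if __name__ == '__main__':
    # 7. main entry point: build the multi-page URL list
    urls = ['https://example.com/page/{0}'.format(i) for i in range(1, 4)]
    for url in urls:
        get_info(url)
        time.sleep(1)  # be polite between pages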
Shortcomings:
Ran into a more advanced anti-scraping measure: CAPTCHAs.
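I didn't get past it; the only mitigation idea is to detect the block page and back off. Below is a minimal sketch of that idea, assuming the CAPTCHA page can be recognized by the marker string '验证码' in the response body; both the marker and the back-off delay are assumptions, not verified against the site.

import time
import requests

def fetch_with_backoff(url, headers, max_retries=3):
    # If the response looks like a CAPTCHA page, wait and retry.
    # Assumption: the block page contains the marker string '验证码'.
    for attempt in range(max_retries):
        resp = requests.get(url, headers=headers)
        if '验证码' not in resp.text:
            return resp
        time.sleep(10 * (attempt + 1))  # assumed back-off; tune as needed
    return None  # still blocked after all retries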
Two classic source files are attached:
# Anjuke rental scraper: collect the listing links from each index page,
# then pull broker and price details from every listing page.
import time
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

def get_links(url):
    # fetch one index page and follow every listing link on it
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('#list-content > div > a')
    for link in links:
        href = link.get("href")
        get_info(href)

def get_info(url):
    # fetch a listing page and extract broker name, rent, address and phone
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    names = soup.select('body > div.wrapper > div.mainbox.cf > div.rbox > div.broker-card > div.broker-border > h2')
    prices = soup.select('body > div.wrapper > div.mainbox.cf > div.lbox > ul.house-info-zufang.cf > li.full-line.cf > span.price > em')
    places = soup.select('#commArround')
    phones = soup.select('body > div.wrapper > div.mainbox.cf > div.rbox > div.broker-card > div.broker-mobile')
    for phone, name, place, price in zip(phones, names, places, prices):
        data = {
            '地址': place.get_text(),  # address
            '月租': price.get_text(),  # monthly rent
            '电话': phone.get_text(),  # phone number
            '姓名': name.get_text()    # broker name
        }
        print(data)

if __name__ == '__main__':
    urls = ['https://xiang.zu.anjuke.com/fangyuan/p{0}/'.format(str(i)) for i in range(1, 4)]
    for single_url in urls:
        get_links(single_url)
        time.sleep(1)  # pause between pages to avoid triggering anti-scraping
# Qiushibaike scraper: print the text of every post on pages 1-9.
import time
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
}

def get_info(url):
    # fetch one page and print the text of each post
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    contents = soup.select('a > div > span')
    for content in contents:
        data = content.get_text()
        print(data)

if __name__ == '__main__':
    urls = ["https://www.qiushibaike.com/8hr/page/{0}".format(str(i)) for i in range(1, 10)]
    for url in urls:
        get_info(url)
        time.sleep(1)  # pause between pages
There is always a reason that makes us start getting stronger.