GhostAatrox
总有一个理由,让自己开始变强

流程思路:
1.导入库  

2.伪装头

3.接收get

4.引入BeautifulSoup

5.引用BeautifulSoup并选择解析器

6.循环遍历输出

7.编写主入口、构造多页函数。

 

不足:
接触到了进阶反爬虫:验证码

 

附上两个经典源码:


import time
import requests
from bs4 import BeautifulSoup
# Pretend to be a desktop Chrome browser so the site serves normal pages.
headers = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                   "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"),
}

def get_links(url):
    """Fetch one listing page and hand every detail-page URL to get_info."""
    response = requests.get(url, headers=headers)
    page = BeautifulSoup(response.text, 'lxml')
    # Each anchor under #list-content > div points at one rental detail page.
    for anchor in page.select('#list-content > div > a'):
        get_info(anchor.get("href"))
def get_info(url):
    """Scrape a single rental detail page and print its key fields.

    Prints one dict per matched record with keys: 地址 (address),
    月租 (monthly rent), 电话 (phone), 姓名 (broker name).
    """
    response = requests.get(url, headers=headers)
    page = BeautifulSoup(response.text, 'lxml')
    brokers = page.select('body > div.wrapper > div.mainbox.cf > div.rbox > div.broker-card > div.broker-border > h2')
    rents = page.select('body > div.wrapper > div.mainbox.cf > div.lbox > ul.house-info-zufang.cf > li.full-line.cf > span.price > em')
    addresses = page.select('#commArround')
    mobiles = page.select('body > div.wrapper > div.mainbox.cf > div.rbox > div.broker-card > div.broker-mobile')
    # zip stops at the shortest list, so ragged pages print only complete records.
    for mobile, broker, address, rent in zip(mobiles, brokers, addresses, rents):
        print({
            '地址': address.get_text(),
            '月租': rent.get_text(),
            '电话': mobile.get_text(),
            '姓名': broker.get_text(),
        })
            
  

if __name__ == '__main__':
    # Crawl listing pages 1-3, pausing a second between pages to be polite.
    for page_no in range(1, 4):
        get_links('https://xiang.zu.anjuke.com/fangyuan/p{0}/'.format(str(page_no)))
        time.sleep(1)

 

 
import time
import requests
from bs4 import BeautifulSoup
# Spoof a desktop Chrome User-Agent so the request looks like a browser.
headers = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                   "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"),
}
def get_info(url):
    """Print the text of every joke block found on one Qiushibaike page."""
    response = requests.get(url, headers=headers)
    page = BeautifulSoup(response.text, 'lxml')
    # Joke bodies sit inside <a><div><span> on the listing page.
    for span in page.select('a > div > span'):
        print(span.get_text())
if __name__ == '__main__':
    # Pages 1-9, one request per second.
    for page_no in range(1, 10):
        get_info("https://www.qiushibaike.com/8hr/page/{0}".format(str(page_no)))
        time.sleep(1)

     

 

posted on 2018-03-19 19:30  GhostAatrox  阅读(83)  评论(0)    收藏  举报