from bs4 import BeautifulSoup
import requests
# Module-level list of listing detail-page URLs; get_link_list() appends to it.
link_list = []
def get_soup(url, timeout=10):
    """Download *url* and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, freezing the whole scrape.

    Returns:
        A ``BeautifulSoup`` object built from the response text using the
        ``lxml`` parser.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'lxml')
def get_link_list(url, soup=None):
    """Collect the detail-page URLs of the listings on one search page.

    Args:
        url: Address of the search-results page. It is only downloaded when
            *soup* is not supplied.
        soup: Already-parsed search page. When given it is used directly, so
            the page is not fetched a second time.

    Returns:
        A new list holding the ``href`` of every listing anchor found on this
        page. Anchors without an ``href`` are skipped.
    """
    if soup is None:
        soup = get_soup(url)
    page_links = [
        anchor['href']
        for anchor in soup.select('#page_list > ul > li > a')
        if anchor.get('href')
    ]
    # The module-level accumulator is still fed for code that reads it
    # directly, but only this page's links are returned so that a caller
    # looping over several search pages does not re-process earlier pages.
    link_list.extend(page_links)
    return page_links
def _text_of(soup, selector):
    """Return ``.string`` of the first element matching *selector*, or None."""
    element = soup.select_one(selector)
    return element.string if element is not None else None


def _attr_of(soup, selector, attribute):
    """Return *attribute* of the first element matching *selector*, or None."""
    element = soup.select_one(selector)
    return element.get(attribute) if element is not None else None


def get_content(page_url=None):
    """Print the basic details of every listing linked from a search page.

    Each listing page is downloaded exactly once and all fields are read from
    that single parsed document. A field whose element is missing from the
    page is reported as ``None`` instead of aborting the run.

    Args:
        page_url: Search-results page to process. Defaults to the module-level
            ``url`` variable, which is how the script's driver loop passes the
            current page.
    """
    if page_url is None:
        page_url = url
    for index_url in get_link_list(page_url, get_soup(page_url)):
        listing = get_soup(index_url)
        data = {
            'name': _text_of(
                listing,
                '#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a',
            ),
            'title': _text_of(listing, 'div.pho_info > h4 > em'),
            # Some listings have no address text, so the address is taken
            # from the element's ``title`` attribute instead.
            'addr': _attr_of(listing, ' div.pho_info > p', 'title'),
            'price': _text_of(listing, '#pricePart > div.day_l > span'),
            'image': _attr_of(listing, '#curBigImage', 'src'),
        }
        print(data)
# Walk the search-result pages; range(1) restricts the run to page index 0.
SEARCH_PAGE_TEMPLATE = 'http://hz.xiaozhu.com/search-duanzufang-p%d-0/'
for page_number in range(1):
    # get_content() reads the module-level ``url``, so assign it first.
    url = SEARCH_PAGE_TEMPLATE % page_number
    get_content()