Scrape rental listings from Xiaozhu (小猪短租): pages 1 through 4, 96 listings in total, collecting each listing's title, address, daily rent, host name, image links, and so on.
Uses requests and BeautifulSoup.
Xiaozhu index page: http://bj.xiaozhu.com/
Example detail page: http://bj.xiaozhu.com/fangzi/134350372103.html
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Fetch the details of a single listing and return them as a dict.
def get_fangzi_info(fangzi_url):
    raw_page = requests.get(fangzi_url).text
    soup = BeautifulSoup(raw_page, 'lxml')
    title = soup.select('div.pho_info > h4')[0].get_text().strip()
    address = soup.select('div.pho_info > p > span')[0].get_text().strip()
    price = soup.select('div.day_l')[0].get_text().strip()
    img_link = soup.select('img[id="curBigImage"]')[0].get('src').strip()
    fangdong_link = soup.select('div.member_pic > a > img')[0].get('src').strip()
    fangdong_name = soup.select('h6 > a')[0].get_text().strip()
    # The page marks male hosts with a span.member_boy_ico icon;
    # its absence is taken to mean the host is female.
    fangdong_sex_eles = soup.select('span.member_boy_ico')
    if fangdong_sex_eles:
        fangdong_sex = 'male'
    else:
        fangdong_sex = 'female'
    fangzi_dict = {
        'title': title,
        'address': address,
        'daily_rent': price,
        'house_image': img_link,
        'host_name': fangdong_name,
        'host_gender': fangdong_sex,
        'host_image': fangdong_link,
    }
    # print(fangzi_dict)
    return fangzi_dict
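# Hypothetical sanity check: parse the example detail page from the intro
# before running the full crawl (uncomment to try it):
# print(get_fangzi_info('http://bj.xiaozhu.com/fangzi/134350372103.html'))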
# Given a listing-overview page, collect the detail-page link of every
# listing on it and return the links as a list.
def get_fangzi_urls(multi_fangzi_url):
    raw_page = requests.get(multi_fangzi_url).text
    soup = BeautifulSoup(raw_page, 'lxml')
    fangzi_eles = soup.select('li[lodgeunitid] > a')
    fangzi_urls = []
    for f in fangzi_eles:
        fangzi_url = f.get('href')
        fangzi_urls.append(fangzi_url)
    # print(fangzi_urls)
    # print('len(fangzi_urls):', len(fangzi_urls))
    return fangzi_urls
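# Hypothetical sanity check: the index page should yield 24 detail-page
# links (uncomment to try it):
# print(len(get_fangzi_urls('http://bj.xiaozhu.com/')))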
index_url = 'http://bj.xiaozhu.com/'
multi_fangzi_urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(2, 5)]  # pages 2, 3, and 4
# print(multi_fangzi_urls)

fangzi_dicts = []
# First, scrape the listings on the index page (page 1).
first_page_urls = get_fangzi_urls(index_url)  # URLs of the 24 listings on the index page
for fangzi_url in first_page_urls:
    fangzi_dict = get_fangzi_info(fangzi_url)
    fangzi_dicts.append(fangzi_dict)
# Then scrape the listings on pages 2 through 4.
for multi_fangzi_url in multi_fangzi_urls:  # one overview page after the index
    post_page_urls = get_fangzi_urls(multi_fangzi_url)  # the 24 listing URLs on that page
    for fangzi_url in post_page_urls:
        fangzi_dict = get_fangzi_info(fangzi_url)
        fangzi_dicts.append(fangzi_dict)

print(fangzi_dicts)
print('len(fangzi_dicts):', len(fangzi_dicts))

df = pd.DataFrame(fangzi_dicts)
df.to_csv('./xiaozhu.csv', index=False, mode='w', encoding='utf-8-sig')
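The script above fires roughly a hundred requests back to back with requests' default headers, which a site like Xiaozhu may throttle or block. Below is a minimal hardening sketch built around a hypothetical get_page helper, assuming a browser-like User-Agent string and a one-second pause between fetches (the header value and the delay are arbitrary assumptions, not anything the site documents). Replacing the bare requests.get(...).text calls in both functions with get_page(...) leaves the rest of the script unchanged.

import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # assumed browser-like UA string

def get_page(url, retries=3, delay=1.0):
    # Fetch url with custom headers and a timeout, pausing after each
    # request and retrying a few times on transient network errors.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            resp.raise_for_status()
            time.sleep(delay)  # be polite: pause after every fetch
            return resp.text
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)  # brief pause before retrying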