爬虫案例安居客信息网
作业:安居客信息网
爬取网址url https://hf.fang.anjuke.com/loupan/all/
一页数据的爬取
# Scrape one listing page and print/collect the fields of every listing.
# NOTE: `webdriver` (from selenium) and `time` must already be imported
# before this snippet runs.

# Result columns; each listing appends exactly one entry to every list.
name_list = []
place_list = []
home_type_list = []
area_list = []
price_list = []
state_list = []
particular_list = []

driver = webdriver.Chrome()
driver.get('https://hf.fang.anjuke.com/?from=HomePage_TopBar')
# find_element(s)("xpath", ...) is used because the find_element(s)_by_*
# helpers were removed in Selenium 4.3; this form works on 3.x and 4.x.
listing_cards = driver.find_elements(
    "xpath", "//div[@class='key-list imglazyload']/div")
for home in listing_cards:
    home_text = home.find_element("xpath", ".//div[@class='infos']").text

    # Price: try the usual markup first, then the alternative markup,
    # and fall back to 0 when neither element is present.
    try:
        price = home.find_element(
            "xpath", ".//a[@class='favor-pos']/p/span").text
    except Exception:
        try:
            price = home.find_element(
                "xpath", ".//p[@class='favor-tag around-price']/span").text
        except Exception:
            price = 0
    price_list.append(price)

    # One field per text line of the info box.
    fields = str(home_text).split('\n')
    # Drop the unrelated "scan QR code to call" line if it is present.
    if '微信扫码拨号' in fields:
        fields.remove('微信扫码拨号')

    name_list.append(fields[0])
    place_list.append(fields[1])

    # Layout type and floor area may be missing from the third line.
    # Both variables are always assigned so the print below never sees
    # an undefined name or a stale value from the previous listing.
    try:
        layout_parts = fields[2].split(' ')
        home_type = layout_parts[1]
        area = layout_parts[2].split(':')[1]
    except IndexError:
        home_type = area = '--未知--'
    home_type_list.append(home_type)
    area_list.append(area)

    state_list.append(fields[3])
    # Everything after the state line except the final line.
    particular = '|'.join(fields[4:-1])
    particular_list.append(particular)
    print(f'名字:{fields[0]}, 地点:{fields[1]},价格:{price},户型:{home_type},'
          f'建筑面积:{area}. 状态{fields[3]}, 特点{particular}')
    time.sleep(0.1)
获取前十页数据,将获取到的数据封装到字典中,再转换为 DataFrame 类型,最后保存到 CSV 文件中
from selenium import webdriver
import time
import re
import pandas as pd
def get_one_page():
    """Scrape every listing on the page currently loaded in ``driver``.

    Reads the module-level ``driver`` and appends one entry per listing to
    the module-level lists ``name_list``, ``place_list``, ``price_list``,
    ``home_type_list``, ``area_list``, ``state_list`` and
    ``particular_list``. Each listing is also printed, followed by a short
    pause.

    Raises:
        IndexError: if a listing's info text has fewer than four lines.
    """
    # find_element(s)("xpath", ...) is used because the find_element(s)_by_*
    # helpers were removed in Selenium 4.3; this form works on 3.x and 4.x.
    listing_cards = driver.find_elements(
        "xpath", "//div[@class='key-list imglazyload']/div")
    for home in listing_cards:
        home_text = home.find_element("xpath", ".//div[@class='infos']").text

        # Price: try the usual markup first, then the alternative markup,
        # and fall back to 0 when neither element is present.
        try:
            price = home.find_element(
                "xpath", ".//a[@class='favor-pos']/p/span").text
        except Exception:
            try:
                price = home.find_element(
                    "xpath",
                    ".//p[@class='favor-tag around-price']/span").text
            except Exception:
                price = 0
        price_list.append(price)

        # One field per text line of the info box.
        fields = str(home_text).split('\n')
        # Drop the unrelated "scan QR code to call" line if it is present.
        if '微信扫码拨号' in fields:
            fields.remove('微信扫码拨号')

        name_list.append(fields[0])
        place_list.append(fields[1])

        # Layout type and floor area may be missing from the third line.
        # Both variables are always assigned so the print below never sees
        # an undefined name or a stale value from the previous listing.
        try:
            layout_parts = fields[2].split(' ')
            home_type = layout_parts[1]
            area = layout_parts[2].split(':')[1]
        except IndexError:
            home_type = area = '--未知--'
        home_type_list.append(home_type)
        area_list.append(area)

        state_list.append(fields[3])
        # Everything after the state line except the final line.
        particular = '|'.join(fields[4:-1])
        particular_list.append(particular)
        print(f'名字:{fields[0]}, 地点:{fields[1]},价格:{price},户型:{home_type},'
              f'建筑面积:{area}. 状态{fields[3]}, 特点{particular}')
        time.sleep(0.1)
if __name__ == '__main__':
    import os

    # Module-level accumulators that get_one_page() appends to.
    name_list = []
    place_list = []
    home_type_list = []
    area_list = []
    price_list = []
    state_list = []
    particular_list = []

    driver = webdriver.Chrome()
    # try/finally guarantees the browser is closed even if scraping or
    # saving fails part-way through.
    try:
        driver.get('https://hf.fang.anjuke.com/?from=HomePage_TopBar')
        get_one_page()
        # Follow the last pagination link nine more times (ten pages total).
        for _ in range(9):
            # find_element("xpath", ...) replaces find_element_by_xpath,
            # which was removed in Selenium 4.3.
            driver.find_element(
                "xpath", './/div[@class="pagination"]/a[last()]').click()
            get_one_page()
            time.sleep(5)
        print(len(name_list), len(place_list), len(home_type_list), len(area_list))

        # Column name -> values; named `columns` to avoid shadowing `dict`.
        columns = {
            '名字': name_list,
            '地点': place_list,
            '价格': price_list,
            '户型': home_type_list,
            '建筑面积': area_list,
            '状态': state_list,
            '特点': particular_list,
        }
        df = pd.DataFrame(columns)
        # to_csv does not create missing directories.
        os.makedirs('data', exist_ok=True)
        df.to_csv('data/房产信息.csv', index=False)
        time.sleep(10)
    finally:
        driver.quit()
这是获取的近六百条数据
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter, defaultdict

# Load the scraped listings and draw three charts side by side:
# price histogram, layout-type pie chart, and mean price per region.
df = pd.read_csv('data/房产信息.csv')
plt.rcParams['font.sans-serif'] = ['SimHei']  # font used to render the Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
fig, ax = plt.subplots(1, 3, figsize=(20, 8), dpi=80)  # one row, three columns
plt.xticks(rotation=270)  # acts on the current axes
# ---------------------------------------------------------------------------
# Price distribution histogram. A price of 0 marks "no price found" in the
# scraped data, so those rows (and missing values) are left out.
price = [p for p in df['价格'] if pd.notna(p) and p != 0]
bin_width = 2000  # width of each histogram bin
if price:
    lowest = int(min(price))
    # Ceiling division, with at least one bin so a narrow price range
    # does not produce a zero-bin histogram.
    bin_count = max(1, int(-(-(max(price) - lowest) // bin_width)))
    # Explicit edges keep every bin exactly `bin_width` wide and aligned
    # with the tick marks.
    bin_edges = [lowest + i * bin_width for i in range(bin_count + 1)]
    ax[0].hist(price, bins=bin_edges)
    ax[0].set_xticks(bin_edges)
ax[0].tick_params(axis='x', labelrotation=270)
ax[0].grid(linestyle='--', alpha=0.5)
ax[0].set_xlabel('房价')
ax[0].set_ylabel('单位房价数量')
ax[0].set_title("房价分布")
# ---------------------------------------------------------------------------
# Layout-type pie chart.
home_type_list = []
for raw_type in df['户型']:
    # Skip listings whose layout type was recorded as unknown.
    if raw_type == '--未知--':
        continue
    type_label = str(raw_type)
    # Merge every label longer than three characters into a single slice.
    home_type_list.append('其他' if len(type_label) > 3 else type_label)
# One Counter supplies both labels and sizes, so they always line up.
type_count = Counter(home_type_list)
ax[1].pie(list(type_count.values()), labels=list(type_count), autopct='%1.2f%%')
ax[1].legend()
ax[1].axis('equal')  # keep the pie circular
# ---------------------------------------------------------------------------
# Mean price per region.
place_list = []
for p in df['地点']:
    # The region is the second space-separated token of the place string;
    # fall back to the "unknown" marker when there is no such token.
    place_parts = str(p).split(' ')
    place_list.append(place_parts[1] if len(place_parts) > 1 else '--未知--')
df['地点'] = place_list
# Group the non-zero prices by region. Regions without any priced listing
# get no entry, which avoids dividing by zero when averaging.
dict1 = defaultdict(list)
for region, region_price in zip(place_list, df['价格']):
    if pd.notna(region_price) and region_price != 0:
        dict1[region].append(region_price)
place_o = list(dict1)
price_o = [round(sum(v) / len(v)) for v in dict1.values()]
x = list(range(len(place_o)))
ax[2].bar(x, price_o, color=['b', 'g', 'r', 'c', 'm', 'y', 'k'])
# Ticks and labels are set separately; passing labels to set_xticks
# requires matplotlib >= 3.5.
ax[2].set_xticks(x)
ax[2].set_xticklabels(place_o)
ax[2].set_title('合肥各地区房价对比')
ax[2].grid(linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()