爬虫案例安居客信息网

作业:安居客信息网

爬取网址url https://hf.fang.anjuke.com/loupan/all/

image-20240120201040361

一页数据的爬取

	# Scrape one result page: open the site, then walk every listing card on it.
	# NOTE: price_list, name_list, place_list, home_type_list, area_list,
	# state_list and particular_list must already exist as lists.
	driver = webdriver.Chrome()
	driver.get('https://hf.fang.anjuke.com/?from=HomePage_TopBar')
	# find_element(s)("xpath", ...) replaces the find_element(s)_by_xpath helpers,
	# which Selenium 4.3 removed; this form works on Selenium 3.x and 4.x alike.
	cards = driver.find_elements("xpath", "//div[@class='key-list imglazyload']/div")
	for home in cards:
	    home_text = home.find_element("xpath", ".//div[@class='infos']").text

	    # Price: try the usual tag first, then the alternative tag; 0 means "no price".
	    try:
	        price = home.find_element("xpath", ".//a[@class='favor-pos']/p/span").text
	    except Exception:
	        try:
	            price = home.find_element("xpath", ".//p[@class='favor-tag around-price']/span").text
	        except Exception:
	            price = 0

	    lines = str(home_text).split('\n')
	    # Drop the QR-code caption so the remaining lines keep fixed positions.
	    if '微信扫码拨号' in lines:
	        lines.remove('微信扫码拨号')

	    name, place, state = lines[0], lines[1], lines[3]

	    # Layout and floor area are sometimes missing. Both names are always
	    # assigned so the summary print below never sees an unbound or stale value.
	    try:
	        layout_parts = lines[2].split(' ')
	        home_type = layout_parts[1]
	        area = layout_parts[2].split(':')[1]
	    except IndexError:
	        home_type = area = '--未知--'

	    # The final line is left out of the feature tags, as in the original
	    # slice; presumably it is not a tag -- TODO confirm against the page.
	    particular = '|'.join(lines[4:-1])

	    # Append all columns together so the lists always stay the same length.
	    price_list.append(price)
	    name_list.append(name)
	    place_list.append(place)
	    home_type_list.append(home_type)
	    area_list.append(area)
	    state_list.append(state)
	    particular_list.append(particular)

	    print(f'名字:{name},  地点:{place},价格:{price},户型:{home_type},'
	          f'建筑面积:{area}. 状态{state}, 特点{particular}')
	    time.sleep(0.1)

获取前十页数据,将获取到的数据封装到字典中,再将字典转化为DataFrame类型,最后将数据保存到csv文件中

from selenium import webdriver
import time
import re
import pandas as pd


def get_one_page():
    """Scrape every listing card on the page currently loaded in ``driver``.

    Reads the module-level ``driver`` and appends one entry per listing to the
    module-level ``name_list``, ``place_list``, ``price_list``,
    ``home_type_list``, ``area_list``, ``state_list`` and ``particular_list``.
    A summary line is printed for each listing.

    Raises:
        IndexError: if a listing's text has fewer than four lines once the
            QR-code caption has been removed.
    """
    unknown = '--未知--'
    # find_element(s)("xpath", ...) replaces the find_element(s)_by_xpath
    # helpers, which Selenium 4.3 removed; this form works on 3.x and 4.x alike.
    cards = driver.find_elements("xpath", "//div[@class='key-list imglazyload']/div")
    for home in cards:
        home_text = home.find_element("xpath", ".//div[@class='infos']").text

        # Price: try the usual tag first, then the alternative tag; 0 means "no price".
        try:
            price = home.find_element("xpath", ".//a[@class='favor-pos']/p/span").text
        except Exception:
            try:
                price = home.find_element("xpath", ".//p[@class='favor-tag around-price']/span").text
            except Exception:
                price = 0

        lines = str(home_text).split('\n')
        # Drop the QR-code caption so the remaining lines keep fixed positions.
        if '微信扫码拨号' in lines:
            lines.remove('微信扫码拨号')

        name, place, state = lines[0], lines[1], lines[3]

        # Layout and floor area are sometimes missing. Both names are always
        # assigned so the summary print below never hits an unbound local or
        # repeats the previous listing's values.
        try:
            layout_parts = lines[2].split(' ')
            home_type = layout_parts[1]
            area = layout_parts[2].split(':')[1]
        except IndexError:
            home_type = area = unknown

        # The final line is left out of the feature tags, as in the original
        # slice; presumably it is not a tag -- TODO confirm against the page.
        particular = '|'.join(lines[4:-1])

        # Append all columns together so the lists always stay the same length.
        price_list.append(price)
        name_list.append(name)
        place_list.append(place)
        home_type_list.append(home_type)
        area_list.append(area)
        state_list.append(state)
        particular_list.append(particular)

        print(f'名字:{name},  地点:{place},价格:{price},户型:{home_type},'
              f'建筑面积:{area}. 状态{state}, 特点{particular}')
        time.sleep(0.1)


if __name__ == '__main__':
    import os

    # Column accumulators that get_one_page() appends to, one entry per listing.
    name_list = []
    place_list = []
    home_type_list = []
    area_list = []
    price_list = []
    state_list = []
    particular_list = []

    driver = webdriver.Chrome()
    try:
        driver.get('https://hf.fang.anjuke.com/?from=HomePage_TopBar')
        get_one_page()
        # Nine more pages after the first one: ten pages in total.
        for _ in range(9):
            # The last link in the pagination bar is clicked to advance.
            # ("xpath" locator form: find_element_by_xpath was removed in Selenium 4.3.)
            driver.find_element("xpath", './/div[@class="pagination"]/a[last()]').click()
            # Pause after the click, before scraping, to give the next page
            # time to load; it also spaces the requests out.
            time.sleep(5)
            get_one_page()

        print(len(name_list), len(place_list), len(home_type_list), len(area_list))
        # Named `columns` rather than `dict` so the builtin is not shadowed.
        columns = {
            '名字': name_list,
            '地点': place_list,
            '价格': price_list,
            '户型': home_type_list,
            '建筑面积': area_list,
            '状态': state_list,
            '特点': particular_list,
        }
        df = pd.DataFrame(columns)
        # Create the output folder first so a missing directory cannot throw
        # away the scraped data at the very last step.
        os.makedirs('data', exist_ok=True)
        df.to_csv('data/房产信息.csv', index=False)
        time.sleep(10)
    finally:
        # Always close the browser, even when scraping or saving failed.
        driver.quit()

这是获取的近六百条数据

image-20240120202207017

import pandas as pd
import matplotlib.pyplot as plt

# Load the listings CSV.
df = pd.read_csv('data/房产信息.csv')

# SimHei lets matplotlib render the Chinese labels; switching off
# axes.unicode_minus keeps the minus sign displaying correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# One row of three subplots: ax[0], ax[1] and ax[2].
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 8), dpi=80)

# Rotate the x tick labels of the current axes.
plt.xticks(rotation=270)
# ---------------------------------------------------------------------------------------------------------------
# Price histogram. Listings without a price are stored as 0; leave them out.
price = [p for p in df['价格'] if p != 0]
distince = 2000  # width of one histogram bin, in the same unit as the prices
if price:
    lowest, highest = min(price), max(price)
    # Explicit bin edges every `distince`, reused as the tick positions, so each
    # bar is exactly one interval wide and lines up with its ticks. A bin
    # *count* of (max - min) // distince is 0 when the range is below one interval.
    edges = list(range(lowest, highest + distince, distince))
    if len(edges) < 2:
        # All prices identical: a histogram still needs two edges for one bin.
        edges.append(lowest + distince)
    ax[0].hist(price, edges)
    ax[0].set_xticks(edges)
ax[0].tick_params(axis='x', labelrotation=270)
ax[0].grid(linestyle='--', alpha=0.5)
ax[0].set_xlabel('房价')
ax[0].set_ylabel('单位房价数量')
ax[0].set_title("房价分布")
# ---------------------------------------------------------------------------------------------------------------
# Layout (户型) pie chart.

# Drop listings whose layout is unknown and merge every label longer than
# three characters into a single '其他' ("other") slice.
home_type_list = [
    '其他' if len(str(layout)) > 3 else layout
    for layout in df['户型']
    if layout != '--未知--'
]

# Count each label once, in first-seen order, so slice sizes and slice labels
# come from the same mapping instead of two separate set() iterations.
layout_counts = {}
for layout in home_type_list:
    layout_counts[layout] = layout_counts.get(layout, 0) + 1
type_count = list(layout_counts.values())

ax[1].pie(type_count, labels=list(layout_counts), autopct='%1.2f%%')
ax[1].legend()
ax[1].axis('equal')  # equal aspect ratio so the pie is drawn as a circle
# ---------------------------------------------------------------------------------------------------------------
# Average price per district, drawn as a bar chart.
place = df['地点']
# Keep only the second space-separated token of each location string, which is
# used as the district name. Raises IndexError if a location has no space.
place_list = [str(p).split(' ')[1] for p in place]
df['地点'] = place_list

# Group the prices by district, skipping listings without a price (0).
# Districts are only added when they have at least one real price, so the
# average below can never divide by zero.
dict1 = {}
for district, listing_price in zip(place_list, df['价格']):
    if listing_price != 0:
        dict1.setdefault(district, []).append(listing_price)

# Parallel lists: rounded mean price and the district it belongs to.
price_o = []
place_o = []
for district, prices in dict1.items():
    price_o.append(round(sum(prices) / len(prices)))
    place_o.append(district)

x = range(len(place_o))
ax[2].bar(x, price_o, color=['b', 'g', 'r', 'c', 'm', 'y', 'k'])
ax[2].set_xticks(x, place_o)
ax[2].set_title('合肥各地区房价对比')
ax[2].grid(linestyle='--', alpha=0.5)
plt.tight_layout()

plt.show()

image-20240121161812633

posted @ 2024-01-21 16:20  low-reed  阅读(23)  评论(0)    收藏  举报