对于北京市租房信息数据采集分析

在如今的时代,疯狂上涨的房价让年轻人买房愈发困难,因此租房成为年轻人更优先的选择,这次数据分析整合了大多数年轻人向往的首都北京的租房价格现状,让年轻人在选择时更得心应手。

 

 

1.通过用户数目极多的租房网站(本次选择信息来源网站为:58同城租房https://bj.58.com/zufang/)获取出租房屋的各种信息

 

 

 

 

 

三.数据分析步骤

1.获取出租房屋的数据(来源https://bj.58.com/zufang/

 

#导入必要的库

# Data Manipulation
import numpy as np
import pandas as pd

#Visualization
import matplotlib. pyplot as plt
import missingno
import seaborn as sns
from pandas.tools.plotting import scatter_matrix
from mpl_toolkits.mplot3d import Axes3D

#Feature selection and Encoding
from sklearn. feature_selection import RFE,RFECV
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn. preprocessing import OneHotEncoder,LabelEncoder,label_binarize

#Machine learning
import sklearn. ensemble as ske
from sklearn import datasets,model_selection,tree,preprocessing,metrics,linear_model
from sklearn.svm import LinearSVC
from sklearn. ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn. neighbors import KNeighborsClassifier
from sklearn. naive bayes import GaussianNB



import requests,json,csv
for page in range(54):
    url='https://gongyu.58.com/guide/api_for_renting?displayLimitNum=15&basequery=room:j|cityId:1|areaId:1|cateId:8&cookie=e87rZl4Z2EYLG6ynBNNEAg==&pageNum={0}&_=1603797279731'.format(page)
    response=requests.get(url)
    response.encoding='utf-8'
    data=json.loads(response.text)['data']
    data=data['position1']['list'][1:]+data['position2']['list'][1:]+data['position3']['list'][1:]+data['position4']['list'][1:]
    for i in data:
        name=i['title'].replace(' ','')
        layout = i['layout']
        area = i['rentRoomArea']
        infor=i['dispLocal']
        money=i['price']
        print(name)
        print(layout)
        print(area)
        print(infor)
        print(money)
        result = [name, layout, area, infor, money]
        with open('租房数据.csv', 'a+',newline='', encoding='gb18030') as f:
            f_csv = csv.writer(f)
            f_csv.writerow(result)

北京各区出租房屋的信息已经有了,为了能对北京目前的租房市场有更直观的认识,对数据进行分析并可视化展示

#分析各行政区房源数量及单价
import pandas as pd
beijing_daname=['朝阳区', '丰台区', '海淀区', '大兴区', '通州区', '昌平区', '东城区', '西城区', '顺义区']
data=pd.read_csv('北京租房数据.csv',encoding='gbk')
areas=list(set(list(data['行政区'])))
area_sums={}
for area in areas:
    area_sums[area]=list(data['行政区']).count(area)
from pyecharts import options as opts
from pyecharts.charts import Bar
import random
hotel_num=[area_sums[i] for i in beijing_daname]
bar = (
    Bar()
    .add_xaxis(beijing_daname)
    .add_yaxis("", hotel_num)
    .set_global_opts(title_opts=opts.TitleOpts(title="北京各区出租房屋数量"))
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=True),
        markline_opts=opts.MarkLineOpts(
            data=[
                opts.MarkLineItem(type_="min", name="最小值"),
                opts.MarkLineItem(type_="max", name="最大值"),
                opts.MarkLineItem(type_="average", name="平均值"),
            ]
        ),
    )
)
bar.render_notebook()

得到的统计图如下

由此可见,北京市内朝阳区的房源数量最多,有1877套;顺义区的房源数量最少,只有272套;9个行政区,每个区平均房源数为611套。

 

#各行政区出租房屋单价
unit_price={} for i in list(data.groupby('行政区')): if i[0] in beijing_daname: unit_price[i[0]]=int(i[1]['价格'].sum()/i[1]['面积'].sum())*50 unit_price bar = ( Bar() .add_xaxis(list(unit_price.keys())) .add_yaxis("", [unit_price[i] for i in list(unit_price.keys())]) .set_global_opts(title_opts=opts.TitleOpts(title="北京市各行政区出租房屋单价(每平米单价*50平米为例)")) .set_series_opts( label_opts=opts.LabelOpts(is_show=True), markline_opts=opts.MarkLineOpts( data=[ opts.MarkLineItem(type_="min", name="最小值"), opts.MarkLineItem(type_="max", name="最大值"), opts.MarkLineItem(type_="average", name="平均值"), ] ), ) ) bar.render_notebook()

得出如下结果

以租房面积为五十平方米为例:

北京市西城区的住房价格最高,为4350元/间;通州区的租房价格最低,为1620元/间,由此可见不同地域位置的价格差距很大。

#分析各户型占比及价格分布
#本次分析户型所列举的为['1室1厅1卫', '2室1厅1卫', '1室0厅1卫', '3室2厅2卫', '2室2厅1卫', '3室1厅1卫', '2室2厅2卫', '2室1厅2卫', '3室1厅2卫', '1室2厅2卫']十种户型
layouts=list(set(data['户型'])) layout=data.loc[:,'户型'].value_counts() from pyecharts import options as opts from pyecharts.charts import Pie print(list(layout.index)[:10]) values=[int(i) for i in list(layout.values)[:10]] pie = ( Pie() .add( "", [(i,j)for i,j in zip(list(layout.index)[:10],values)], radius=["30%", "75%"], center=["40%", "50%"], rosetype="radius", label_opts=opts.LabelOpts(is_show=False), ) .set_global_opts( title_opts=opts.TitleOpts(title="北京市各区出租房屋户型占比"), legend_opts=opts.LegendOpts(type_="scroll", pos_left="85%", orient="vertical"),) .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c},{d}%")) ) pie.render_notebook()

分析结果如下



本次统计数据分析可以看出,出租房屋的主流是1室1厅1卫,占比41.86%;,其次分别是是2室1厅1卫、1室0厅1卫,各占比30.58%和11.02%,因此可以看出,小户型在出租房屋中更占优势、更受租客的欢迎。

#分析北京市各区出租房屋户型占比以及价格
cut_n=list(range(0,12000,1000)) income=pd.cut(data["价格"],cut_n) price_cut=data['价格'].groupby(income).count() index=list(price_cut.index) index=[str(i) for i in list(price_cut.index)] values=[int(i) for i in list(price_cut.values)] pie = ( Pie() .add( "", [(i,j)for i,j in zip(index,values)], radius=["30%", "75%"], center=["40%", "50%"], rosetype="radius", label_opts=opts.LabelOpts(is_show=False), ) .set_global_opts( title_opts=opts.TitleOpts(title="北京市各区出租房屋户型占比"), legend_opts=opts.LegendOpts(type_="scroll", pos_left="85%", orient="vertical"),) .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}, {d}%")) ) pie.render_notebook()

 

目前,大部分北京租客的租房价格在3000至6000元,占比50%左右,最便宜有1000元以下的,但位置相对较偏,且面积在20平以内;贵的有1万多的,这种一般面积在100平以上,位置在主城区,但选择此类房屋的租客极少,更多人还是选择了性价比更高的小户型。

 
#开始前导入必要的库
# Data Manipulation
import numpy as np import pandas as pd

#Visualization
import matplotlib. pyplot as plt import missingno import seaborn as sns from pandas.tools.plotting import scatter_matrix from mpl_toolkits.mplot3d import Axes3D


#Feature selection and Encoding
from sklearn. feature_selection import RFE,RFECV from sklearn.svm import SVR from sklearn.decomposition import PCA from sklearn. preprocessing import OneHotEncoder,LabelEncoder,label_binarize

#Machine learning import sklearn. ensemble as ske from sklearn import datasets,model_selection,tree,preprocessing,metrics,linear_model from sklearn.svm import LinearSVC from sklearn. ensemble import RandomForestClassifier,GradientBoostingClassifier from sklearn. neighbors import KNeighborsClassifier from sklearn. naive bayes import GaussianNB import requests,json,csv #分析网页,获取原始数据 #通过抓包,获取数据接口,通过pageNum参数控制页码,共有54页数据,返回json格式数据 for page in range(54): url='https://gongyu.58.com/guide/api_for_renting?displayLimitNum=15&basequery=room:j|cityId:1|areaId:1|cateId:8&cookie=e87rZl4Z2EYLG6ynBNNEAg==&pageNum={0}&_=1603797279731'.format(page) response=requests.get(url) response.encoding='utf-8' data=json.loads(response.text)['data'] data=data['position1']['list'][1:]+data['position2']['list'][1:]+data['position3']['list'][1:]+data['position4']['list'][1:] for i in data: name=i['title'].replace(' ','') layout = i['layout'] area = i['rentRoomArea'] infor=i['dispLocal'] money=i['price'] print(name) print(layout) print(area) print(infor) print(money) result = [name, layout, area, infor, money] #存入csv表格中准备后续使用 with open('租房数据.csv', 'a+',newline='', encoding='gb18030') as f: f_csv = csv.writer(f) f_csv.writerow(result) #分析各行政区房源数量及单价 import pandas as pd
beijing_daname
=['朝阳区', '丰台区', '海淀区', '大兴区', '通州区', '昌平区', '东城区', '西城区', '顺义区'] data=pd.read_csv('北京租房数据.csv',encoding='gbk') areas=list(set(list(data['行政区']))) area_sums={}
for area in areas: area_sums[area]=list(data['行政区']).count(area)
from pyecharts import options as opts from pyecharts.charts import Bar import random
hotel_num
=[area_sums[i] for i in beijing_daname]
bar
= ( Bar() .add_xaxis(beijing_daname) .add_yaxis("", hotel_num) .set_global_opts(title_opts=opts.TitleOpts(title="北京各区出租房屋数量")) .set_series_opts( label_opts=opts.LabelOpts(is_show=True), markline_opts=opts.MarkLineOpts( data=[ opts.MarkLineItem(type_="min", name="最小值"), opts.MarkLineItem(type_="max", name="最大值"), opts.MarkLineItem(type_="average", name="平均值"), ] ), ) )
bar.render_notebook()
#各行政区出租房屋单价 unit_price={}
for i in list(data.groupby('行政区')): if i[0] in beijing_daname: unit_price[i[0]]=int(i[1]['价格'].sum()/i[1]['面积'].sum())*50 unit_price
bar
= ( Bar() .add_xaxis(list(unit_price.keys())) .add_yaxis("", [unit_price[i] for i in list(unit_price.keys())]) .set_global_opts(title_opts=opts.TitleOpts(title="北京各区出租房屋均价(每平米单价*50平米为例)")) .set_series_opts( label_opts=opts.LabelOpts(is_show=True), markline_opts=opts.MarkLineOpts( data=[ opts.MarkLineItem(type_="min", name="最小值"), opts.MarkLineItem(type_="max", name="最大值"), opts.MarkLineItem(type_="average", name="平均值"), ] ), ) )
bar.render_notebook()
#分析各户型占比及价格分布 #本次分析户型所列举的为: #['1室1厅1卫', '2室1厅1卫', '1室0厅1卫', '3室2厅2卫', '2室2厅1卫', '3室1厅1卫', '2室2厅2卫', '2室1厅2卫', '3室1厅2卫', '1室2厅2卫']十种户型 layouts=list(set(data['户型'])) layout=data.loc[:,'户型'].value_counts()
from pyecharts import options as opts from pyecharts.charts import Pie print(list(layout.index)[:10])
values
=[int(i) for i in list(layout.values)[:10]]
pie
= ( Pie() .add( "", [(i,j)for i,j in zip(list(layout.index)[:10],values)], radius=["30%", "75%"], center=["40%", "50%"], rosetype="radius", label_opts=opts.LabelOpts(is_show=False), )
.set_global_opts( title_opts
=opts.TitleOpts(title="北京市各区出租房屋户型占比"), legend_opts=opts.LegendOpts(type_="scroll", pos_left="85%", orient="vertical"),) .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c},{d}%")) )
pie.render_notebook()
#分析北京市各区出租房屋户型占比以及价格 cut_n=list(range(0,12000,1000))
income
=pd.cut(data["价格"],cut_n) price_cut=data['价格'].groupby(income).count()
index
=list(price_cut.index)
index
=[str(i) for i in list(price_cut.index)]
values
=[int(i) for i in list(price_cut.values)] pie = ( Pie() .add( "", [(i,j)for i,j in zip(index,values)], radius=["30%", "75%"], center=["40%", "50%"], rosetype="radius", label_opts=opts.LabelOpts(is_show=False), )
.set_global_opts( title_opts
=opts.TitleOpts(title="北京市各区出租房屋户型占比"), legend_opts=opts.LegendOpts(type_="scroll", pos_left="85%", orient="vertical"),)
.set_series_opts(label_opts
=opts.LabelOpts(formatter="{b}: {c}, {d}%")) )
pie.render_notebook()

出,北京市不仅有着高昂的房价,租房价格也非常高,当代年轻人肩上所担负的压力非常大。

数据的采集与分析可以让人对繁杂的数据有非常直观的感受,但是本次采集的数据还只是一小部分,总的来说信息数据采集范围不够广泛,样本不够全面,我还会继续加深学习。




 

posted @ 2021-06-21 19:48  TKn  阅读(432)  评论(0编辑  收藏  举报