Python大连房价信息爬取与可视化

一、摘要:

  本项目应用Python网络爬虫技术,通过爬取贝壳网的房价信息通过可视化查看各个区域每个时间段的房价信息。

二、选题背景:

  在现在经济生活中,房地产市场成为金融、建筑和股市等经济板块发展的重要推动力,其产业的兴衰影响着国民经济的发展状态,商品房价格是房地产市场的最重要和最直接的反映,是商品房价值的货币表现. 
在我国目前这个特定的紧急发展阶段,商品房市场作为城市经济体系的一个重要组成部分,其发展演变在很大程度上取决与城市的经济状况、城市的发展目标以及城市的相关政策和管制措施.
我国的住房问题更多的是与其他大量微观的和宏观的经济和社会问题联系在一起.因此,我国经济高速发展、农村剩余劳动力向城市快速转移、住房产业逐步成为城市经济支柱产业的背景下发展。

三、数据说明:

  爬取贝壳网房价数据和房屋户型信息,对房价信息进行动态可视化,并对户型信息进行词云绘制。

四、实施过程及代码:

导入库

1 import requests
2 import random
3 import time
4 from bs4 import BeautifulSoup
5 import openpyxl
6 from openpyxl import load_workbook
7 from openpyxl import Workbook
8 import matplotlib.pyplot as plt

构造爬虫程序,下面演示了爬取甘井子地区前九页房价信息的爬虫,剩下区域可根据相同方法爬取

 1 user_agent_list = [
 2     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
 3     "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
 4     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
 5     "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
 6     "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
 7     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
 8     "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
 9     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3"
10 ]#伪装电脑
11 UA = random.choice(user_agent_list)  ##从self.user_agent_list中随机取出一个字符串
12 headers = {'User-Agent': UA}  ##构造成一个完整的User-Agent (UA代表的是上面随机取出来的字符串哦)
13 name = 'ganjingzi'#区域名拼写
14 url = 'https://dl.ke.com/chengjiao/{}/pg'.format(name)#构造URL
15 all_message = []#用于存放所有爬取的信息
16 for x in range(1,10):#页数
17     print('{}/10'.format(x))
18  
19     time.sleep(random.randint(2,5)) 
20     print(url+str(x)+str(x))
21  
22     with requests.get(url+str(x), headers=headers, timeout=5) as response:
23  
24         soup = BeautifulSoup(response.text, 'lxml')
25  
26         # print(soup.title.text)  # 打印title
27  
28         li_list = soup.find('ul', class_='listContent').find_all('li')
29         # print(li_list)
30         for li_quick in li_list:
31  
32             try:
33                 #取成交日期
34                 dealDate=li_quick.find('div',class_='dealDate').get_text().strip()
35                 #取名称
36                 title=li_quick.find('div',class_='title').a.get_text().strip()
37                 #取总价
38                 totalPrice=li_quick.find('div',class_='totalPrice').find('span',class_='number').get_text().strip()
39                 #取单价
40                 unitPrice=li_quick.find('div',class_='unitPrice').find('span',class_='number').get_text().strip()
41                 #取位置信息
42                 positionInfo=li_quick.find('div',class_='positionInfo').get_text().strip()
43                 #取成交周期
44                 dealCycleTxt0=li_quick.find('div',class_='dealCycleeInfo').find('span',class_='dealCycleTxt').find_all('span')[0].get_text().strip()
45                 #取挂牌价格
46                 dealCycleTxt1=li_quick.find('div',class_='dealCycleeInfo').find('span',class_='dealCycleTxt').find_all('span')[1].get_text().strip()
47                 all_message.append([title,dealDate,totalPrice,unitPrice,dealCycleTxt0,dealCycleTxt1,positionInfo])
48             except:
49                 continue
50             finally:
51                 print(title,',',dealDate,',',totalPrice,',',unitPrice,',',dealCycleTxt0,',',dealCycleTxt1,',',positionInfo)

保存爬取到的房价信息,将各个区域的数据都按照此方法爬取并保存

1 wb3 = Workbook()
2 sheet3 = wb3.active
3 for i in range(len(all_message)):
4     sheet3.append(all_message[i])
5 wb3.save(r'D:\program\黑龙江房价\{}.xlsx'.format(name))

导入数据统计和可视化用到的库

 1 import collections
 2 from pyecharts import options as opts
 3 # from pyecharts import options as opts
 4 from pyecharts.charts import Bar, Page, Pie, Timeline
 5 from random import randrange
 6 import openpyxl
 7 from openpyxl import load_workbook
 8 from openpyxl import Workbook
 9 from collections import Counter
10 import random

读取爬取的数据,将数据根据日期进行排序并去重

 1 def data_s(name):
 2     all_message = []
 3     wb3 = load_workbook(r'D:\program\黑龙江房价\{}.xlsx'.format(name))
 4     sheet3 = wb3.get_sheet_by_name('Sheet')
 5     rows = sheet3.max_row
 6     columns = sheet3.max_column
 7     for x in range(1,rows+1):
 8         tem = []
 9         for y in range(1,columns+1):
10             tem.append(sheet3.cell(x,y).value)
11         all_message.append(tem)
12 
13     # print(all_message)
14     sorted(all_message,key=(lambda x:x[1]))#根据日期排序
15     # print(all_message)
16     final = []
17     for i in all_message:#数据清洗,去重
18         if i not in final:
19             final.append(i)
20     wb4 = Workbook()
21     sheet4 = wb4.active
22     for i in range(len(final)):
23         sheet4.append(final[i])
24     wb4.save(r'D:\program\黑龙江房价\{}_final.xlsx'.format(name))
25     return final
1 data_s('高新园区')#显示高新园区排序和清洗后的数据

 

 

 构造统计函数,对处理后的数据进行统计,以便画图

 1 def get_xy(name):#duza
 2     gx = data_s(name)
 3     date = []
 4     for i in gx:
 5         # print(i)
 6         date.append(i[1][:7])
 7     # print(date)
 8     values_counts = Counter(date)
 9 #     print(values_counts)
10 #     print(len(values_counts))
11     sorted(values_counts.keys())
12     print(values_counts)
13     x = []
14     y = []
15     for i in values_counts:
16         x.append(i)
17     x.sort()
18     for i in x:
19         sum = 0
20         for each in gx:
21             if i in each[1]:
22                 sum+=int(each[3])
23         print(sum)
24         print(sum/values_counts[i])
25         y.append(int(sum/values_counts[i]))
26     return x,y

画各个区域每平方房价变化的动态柱状图

 1 city = ['高新园区','甘井子','金州','开发区','旅顺口','沙河口','普兰店','瓦房店','西岗','中山']
 2 # city = ['高新园区']
 3 # x,y = get_xy('gaoxinyuanqu')
 4 corlo = ['blue','red','purple','green','pink','orange','scarlet']
 5 def timeline_bar() -> Timeline:
 6     # x = ['2020.12', '2021.01', '2021.02', '2021.03', '2021.04', '2021.05']
 7     tl = Timeline()
 8     for i in city:
 9         a,y = get_xy(i)
10         x = a
11         bar = Bar()  
12         bar.add_xaxis(x)
13         bar.add_yaxis(' ',y)
14         bar.set_global_opts(title_opts=opts.TitleOpts("大连{}平均房价走势图".format(i)))
15         tl.add(bar, "{}".format(i))
16     tl.render(r'D:\program\黑龙江房价\image\各个区域每平方房价变化的动态柱状图.html')
17     tl.render_notebook()
18     return tl
1 timeline_bar()

 各个区域的每平方价格

 1 city = ['高新园区','甘井子','金州','开发区','旅顺口','沙河口','普兰店','瓦房店','西岗','中山']
 2 # city = ['高新园区']
 3 # x,y = get_xy('gaoxinyuanqu')
 4 corlo = ['blue','red','purple','green','pink','orange','scarlet']
 5 # x = ['2020.12', '2021.01', '2021.02', '2021.03', '2021.04', '2021.05']
 6 def bar(name):
 7     a,y = get_xy(name)
 8     x = a
 9     bar = Bar()  
10     bar = Bar()  
11     bar.add_xaxis(x)
12     bar.add_yaxis(' ',y)
13     bar.set_global_opts(title_opts=opts.TitleOpts("{}平均房价柱状图".format(name)))
14     bar.render_notebook()
15     bar.render(r'D:\program\黑龙江房价\image\{}平均房价柱状图.html'.format(name))
16 #     bar.add('{}房价'.format(name), x, y, is_label_show=True, is_datazoom_show=True, datazoom_type='both') 
17     return bar
1 bar(city[0])

1 bar(city[1])

1 bar(city[2])

1 bar(city[3])

1 bar(city[4])

1 bar(city[6])

1 bar(city[5])

1 bar(city[7])

1 bar(city[8])

1 bar(city[9])

 导入绘制词云图需要的库

 1 import time
 2 import openpyxl
 3 from openpyxl import load_workbook
 4 from openpyxl import Workbook
 5 import random
 6 import re
 7 import datetime
 8 # 导入词云制作库wordcloud和中文分词库jieba
 9 import jieba
10 import wordcloud
11 from wordcloud import WordCloud,ImageColorGenerator

构造生成词云的函数

 1 def cy(name):
 2     # 构建并配置词云对象w,注意要加scale参数,提高清晰度
 3     w = wordcloud.WordCloud(width=1000,
 4                             height=700,
 5                             background_color='white',
 6                             font_path='msyh.ttc',
 7                             scale=15)
 8     txt = []
 9     wb1 = load_workbook(r'D:\program\黑龙江房价\{}_final.xlsx'.format(name))
10     sheet1 = wb1.get_sheet_by_name('Sheet')
11     rows = sheet1.max_row
12     columns = sheet1.max_column
13     for x in range(1,rows):
14         txt.append(sheet1.cell(x,1).value)
15     # txtlist = jieba.lcut(txt)
16     txtlist = txt
17     print('{}的词云图'.format(name))
18     string = " ".join(txtlist)
19     # 将string变量传入w的generate()方法,给词云输入文字
20     w.generate(string)
21     # 将词云图片导出到当前文件夹
22     w.to_file(r'D:\program\黑龙江房价\{}.png'.format(name))
23     plt.figure(name)
24     # 以图片的形式显示词云
25     plt.imshow(w)
26     # 关闭图像坐标系
27     plt.axis("off")
28     plt.show()

画出每个区域的词云图,词云图是根据房型和位置名进行绘制的

1 city = ['高新园区','甘井子','金州','开发区','旅顺口','沙河口','普兰店','瓦房店','西岗','中山']
2 for i in city:
3     cy(i)
高新园区的词云图
甘井子的词云图
金州的词云图
开发区的词云图
旅顺口的词云图
沙河口的词云图
普兰店的词云图
瓦房店的词云图
西岗的词云图
中山的词云图

完整代码:

  1 import requests
  2 import random
  3 import time
  4 from bs4 import BeautifulSoup
  5 import openpyxl
  6 from openpyxl import load_workbook
  7 from openpyxl import Workbook
  8 import matplotlib.pyplot as plt
  9 
 10 user_agent_list = [
 11     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
 12     "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
 13     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
 14     "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
 15     "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
 16     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
 17     "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
 18     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3"
 19 ]#伪装电脑
 20 UA = random.choice(user_agent_list)  ##从self.user_agent_list中随机取出一个字符串
 21 headers = {'User-Agent': UA}  ##构造成一个完整的User-Agent (UA代表的是上面随机取出来的字符串哦)
 22 name = 'ganjingzi'#区域名拼写
 23 url = 'https://dl.ke.com/chengjiao/{}/pg'.format(name)#构造URL
 24 all_message = []#用于存放所有爬取的信息
 25 for x in range(1,10):#页数
 26     print('{}/10'.format(x))
 27  
 28     time.sleep(random.randint(2,5)) 
 29     print(url+str(x)+str(x))
 30  
 31     with requests.get(url+str(x), headers=headers, timeout=5) as response:
 32  
 33         soup = BeautifulSoup(response.text, 'lxml')
 34  
 35         # print(soup.title.text)  # 打印title
 36  
 37         li_list = soup.find('ul', class_='listContent').find_all('li')
 38         # print(li_list)
 39         for li_quick in li_list:
 40  
 41             try:
 42                 #取成交日期
 43                 dealDate=li_quick.find('div',class_='dealDate').get_text().strip()
 44                 #取名称
 45                 title=li_quick.find('div',class_='title').a.get_text().strip()
 46                 #取总价
 47                 totalPrice=li_quick.find('div',class_='totalPrice').find('span',class_='number').get_text().strip()
 48                 #取单价
 49                 unitPrice=li_quick.find('div',class_='unitPrice').find('span',class_='number').get_text().strip()
 50                 #取位置信息
 51                 positionInfo=li_quick.find('div',class_='positionInfo').get_text().strip()
 52                 #取成交周期
 53                 dealCycleTxt0=li_quick.find('div',class_='dealCycleeInfo').find('span',class_='dealCycleTxt').find_all('span')[0].get_text().strip()
 54                 #取挂牌价格
 55                 dealCycleTxt1=li_quick.find('div',class_='dealCycleeInfo').find('span',class_='dealCycleTxt').find_all('span')[1].get_text().strip()
 56                 all_message.append([title,dealDate,totalPrice,unitPrice,dealCycleTxt0,dealCycleTxt1,positionInfo])
 57             except:
 58                 continue
 59             finally:
 60                 print(title,',',dealDate,',',totalPrice,',',unitPrice,',',dealCycleTxt0,',',dealCycleTxt1,',',positionInfo)
 61 
 62 wb3 = Workbook()
 63 sheet3 = wb3.active
 64 for i in range(len(all_message)):
 65     sheet3.append(all_message[i])
 66 wb3.save(r'D:\program\黑龙江房价\{}.xlsx'.format(name))
 67 
 68 import collections
 69 from pyecharts import options as opts
 70 # from pyecharts import options as opts
 71 from pyecharts.charts import Bar, Page, Pie, Timeline
 72 from random import randrange
 73 import openpyxl
 74 from openpyxl import load_workbook
 75 from openpyxl import Workbook
 76 from collections import Counter
 77 import random
 78 
 79 def data_s(name):
 80     all_message = []
 81     wb3 = load_workbook(r'D:\program\黑龙江房价\{}.xlsx'.format(name))
 82     sheet3 = wb3.get_sheet_by_name('Sheet')
 83     rows = sheet3.max_row
 84     columns = sheet3.max_column
 85     for x in range(1,rows+1):
 86         tem = []
 87         for y in range(1,columns+1):
 88             tem.append(sheet3.cell(x,y).value)
 89         all_message.append(tem)
 90 
 91     # print(all_message)
 92     sorted(all_message,key=(lambda x:x[1]))#根据日期排序
 93     # print(all_message)
 94     final = []
 95     for i in all_message:#数据清洗,去重
 96         if i not in final:
 97             final.append(i)
 98     wb4 = Workbook()
 99     sheet4 = wb4.active
100     for i in range(len(final)):
101         sheet4.append(final[i])
102     wb4.save(r'D:\program\黑龙江房价\{}_final.xlsx'.format(name))
103     return final
104 
105 data_s('高新园区')#显示高新园区排序和清洗后的数据
106 
107 def get_xy(name):#duza
108     gx = data_s(name)
109     date = []
110     for i in gx:
111         # print(i)
112         date.append(i[1][:7])
113     # print(date)
114     values_counts = Counter(date)
115 #     print(values_counts)
116 #     print(len(values_counts))
117     sorted(values_counts.keys())
118     print(values_counts)
119     x = []
120     y = []
121     for i in values_counts:
122         x.append(i)
123     x.sort()
124     for i in x:
125         sum = 0
126         for each in gx:
127             if i in each[1]:
128                 sum+=int(each[3])
129         print(sum)
130         print(sum/values_counts[i])
131         y.append(int(sum/values_counts[i]))
132     return x,y
133 
134 city = ['高新园区','甘井子','金州','开发区','旅顺口','沙河口','普兰店','瓦房店','西岗','中山']
135 # city = ['高新园区']
136 # x,y = get_xy('gaoxinyuanqu')
137 corlo = ['blue','red','purple','green','pink','orange','scarlet']
138 def timeline_bar() -> Timeline:
139     # x = ['2020.12', '2021.01', '2021.02', '2021.03', '2021.04', '2021.05']
140     tl = Timeline()
141     for i in city:
142         a,y = get_xy(i)
143         x = a
144         bar = Bar()  
145         bar.add_xaxis(x)
146         bar.add_yaxis(' ',y)
147         bar.set_global_opts(title_opts=opts.TitleOpts("大连{}平均房价走势图".format(i)))
148         tl.add(bar, "{}".format(i))
149     tl.render(r'D:\program\黑龙江房价\image\各个区域每平方房价变化的动态柱状图.html')
150     tl.render_notebook()
151     return tl
152 
153 timeline_bar()
154 
155 
156 city = ['高新园区','甘井子','金州','开发区','旅顺口','沙河口','普兰店','瓦房店','西岗','中山']
157 # city = ['高新园区']
158 # x,y = get_xy('gaoxinyuanqu')
159 corlo = ['blue','red','purple','green','pink','orange','scarlet']
160 # x = ['2020.12', '2021.01', '2021.02', '2021.03', '2021.04', '2021.05']
161 def bar(name):
162     a,y = get_xy(name)
163     x = a
164     bar = Bar()  
165     bar = Bar()  
166     bar.add_xaxis(x)
167     bar.add_yaxis(' ',y)
168     bar.set_global_opts(title_opts=opts.TitleOpts("{}平均房价柱状图".format(name)))
169     bar.render_notebook()
170     bar.render(r'D:\program\黑龙江房价\image\{}平均房价柱状图.html'.format(name))
171 #     bar.add('{}房价'.format(name), x, y, is_label_show=True, is_datazoom_show=True, datazoom_type='both') 
172     return bar
173 
174 
175 bar(city[0])
176 
177 bar(city[1])
178 
179 bar(city[2])
180 
181 bar(city[3])
182 
183 bar(city[4])
184 
185 bar(city[6])
186 
187 bar(city[5])
188 
189 bar(city[7])
190 
191 bar(city[8])
192 
193 bar(city[9])
194 
195 import time
196 import openpyxl
197 from openpyxl import load_workbook
198 from openpyxl import Workbook
199 import random
200 import re
201 import datetime
202 # 导入词云制作库wordcloud和中文分词库jieba
203 import jieba
204 import wordcloud
205 from wordcloud import WordCloud,ImageColorGenerator
206 
207 def cy(name):
208     # 构建并配置词云对象w,注意要加scale参数,提高清晰度
209     w = wordcloud.WordCloud(width=1000,
210                             height=700,
211                             background_color='white',
212                             font_path='msyh.ttc',
213                             scale=15)
214     txt = []
215     wb1 = load_workbook(r'D:\program\黑龙江房价\{}_final.xlsx'.format(name))
216     sheet1 = wb1.get_sheet_by_name('Sheet')
217     rows = sheet1.max_row
218     columns = sheet1.max_column
219     for x in range(1,rows):
220         txt.append(sheet1.cell(x,1).value)
221     # txtlist = jieba.lcut(txt)
222     txtlist = txt
223     print('{}的词云图'.format(name))
224     string = " ".join(txtlist)
225     # 将string变量传入w的generate()方法,给词云输入文字
226     w.generate(string)
227     # 将词云图片导出到当前文件夹
228     w.to_file(r'D:\program\黑龙江房价\{}.png'.format(name))
229     plt.figure(name)
230     # 以图片的形式显示词云
231     plt.imshow(w)
232     # 关闭图像坐标系
233     plt.axis("off")
234     plt.show()
235 
236 
237 city = ['高新园区','甘井子','金州','开发区','旅顺口','沙河口','普兰店','瓦房店','西岗','中山']
238 for i in city:
239     cy(i)

总结:

  通过该项目的可视化效果可以查看每个区域每个季度的房价信息以便根据实际价格决定房屋购买时间,还可以通过词云直观的将受欢迎的房型表示出来,供用户参考 。

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 
posted @ 2021-06-23 11:32  2003840251李胤霖  阅读(464)  评论(0编辑  收藏  举报