【爬虫】安居客二手房小区数据
目前国内主流房产网站中【安居客】和【房天下】提供小区信息,相较而言安居客的信息更全一点。
以苏州常熟为例,小区信息导航页网址如下(其中 {0} 为页码):
https://suzhou.anjuke.com/community/changshua/p{0}

首先通过访问导航页获取各小区详情页链接
- 点进其中一个链接可以看到详情页网站格式是这样的:https://suzhou.anjuke.com/community/view/789024
- 搜索view后面的编码可以在导航页源码中定位到记录链接的相应行
- 搜索行所属类li-row正好有25条数据,与一页上的小区数量正好对应

之后在详情页获取所需的小区名称、位置、房价、户数等信息
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
'''
@File    :   2-安居客二手房小区.py
@Time    :   2022/06/23 23:01:08
@Author  :   YeCheng
@Version :   1.0
@Contact :   YeCheng_0728@163.com
@License :   (C)Copyright 2021-2022, Liugroup-NLPR-CASIA
@Desc    :   Scrape Anjuke community (xiaoqu) data for Changshu, Suzhou:
             walk the paginated listing, open every community detail page
             linked from it, and save selected fields to a CSV file.
             Progress is stored in a small log file so that an interrupted
             run can be resumed from the page it stopped at.
@Variate :
'''

# here put the import lib
# NOTE(review): `add`, `text`, `response`, `json`, `Polygon`, `gpd` and `time`
# are not referenced anywhere in this script; the imports are kept untouched
# in case something outside this file relies on them.
try:
    from audioop import add
except ImportError:
    # `audioop` was removed from the standard library in Python 3.13; the
    # name is unused here, so a missing module must not stop the scraper.
    add = None
from math import ceil
import os
from pydoc import text
from urllib import response
import pandas as pd
import requests as rq
import json
import parsel
from shapely.geometry import Polygon
import geopandas as gpd
from fake_useragent import UserAgent as fua
import time

# CSV file that receives the scraped rows.
output_path = r'常熟二手房小区.csv'
# Log file holding the number of the next listing page to scrape.
log_path = r'pause.txt'

# Cookie sent with listing-page requests.
cookie = 'aQQ_ajkguid=B76E9B6F-0513-B5FF-50D0-975B1EF0A078; id58=CpQDSGK6ZrU4/zZDcVHZAg==; 58tj_uuid=617316e7-8344-4c68-8d32-7768ffb101bb; als=0; _ga=GA1.2.1292231950.1656383164; ajk-appVersion=; ctid=19; sessid=01610631-6594-3834-D6E7-C6BEF387C023; twe=2; _gid=GA1.2.1523024107.1656903013; wmda_uuid=ef475760c168cf6c0f937705d3556b37; wmda_new_uuid=1; wmda_visited_projects=%3B6289197098934; lps=https%3A%2F%2Fsu.zu.anjuke.com%2Ffangyuan%2Fchangshua%2F%3Ffrom%3DSearchBar%7Chttps%3A%2F%2Fsuzhou.anjuke.com%2F; cmctid=5; fzq_h=577c79415232b90bb42012eab6c617db_1656903012492_b22187011b6a496ba72c819ce682153f_3550630220; obtain_by=2; init_refer=https%253A%252F%252Fwww.anjuke.com%252F; new_uv=4; new_session=1; fzq_js_anjuke_xiaoqu_pc=fe800f4514a9028e7625ea233b4e33de_1656919959566_24; xxzl_cid=84fa2e27735d4a9cb7ab65e23a6d232b; xzuid=95d7fca9-4551-46aa-81d2-eacd4844e9bd'
# NOTE(review): `user_agent` is defined but the headers below use a random
# agent from fake_useragent instead.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
# NOTE(review): the 'https' key is repeated, so only the last entry survives
# in the dict, and `proxies` is not passed to any request in this script.
proxies = {'http':'http://123.57.140.217','https':'https://49.7.19.74','https':'https://218.7.171.91','https':'https://121.40.169.87'}
# Original comment here read "isochrone parameter settings"; it does not seem
# to describe the code that follows - presumably left over from another script.
fuas = fua()
headers = {
    'Connection': 'close',
    'cookie': cookie,
    'User-Agent': str(fuas.random),
}
# Cookie sent with detail-page requests.
cookie2 = 'SECKEY_ABVK=mVIEINJCgTyiOsDB4cHJgETYvkI5kfX7xsPr1gAWDFc%3D; BMAP_SECKEY=3GCuE6Pr8IM8LvcOOD4FgpsxMjYnvaaSiTAPe80z8w-R2VS8nhtkM5wPamEOQdWhz8GJKsx8LC9Ee6OvB7C0c6LlGQxg0opM7Wt0B-CMmryBmCrPuuWkONbK7KU17XJZc4-SMjm2wbL7utVPQ2FqR03z_JRLo5PtMWVjlEhenfbkGlG12GTU-R3VkRky5ehi; aQQ_ajkguid=B76E9B6F-0513-B5FF-50D0-975B1EF0A078; id58=CpQDSGK6ZrU4/zZDcVHZAg==; 58tj_uuid=617316e7-8344-4c68-8d32-7768ffb101bb; als=0; _ga=GA1.2.1292231950.1656383164; ajk-appVersion=; ctid=19; sessid=01610631-6594-3834-D6E7-C6BEF387C023; twe=2; _gid=GA1.2.1523024107.1656903013; wmda_uuid=ef475760c168cf6c0f937705d3556b37; wmda_new_uuid=1; wmda_visited_projects=%3B6289197098934; lps=https%3A%2F%2Fsu.zu.anjuke.com%2Ffangyuan%2Fchangshua%2F%3Ffrom%3DSearchBar%7Chttps%3A%2F%2Fsuzhou.anjuke.com%2F; cmctid=5; fzq_h=577c79415232b90bb42012eab6c617db_1656903012492_b22187011b6a496ba72c819ce682153f_3550630220; new_uv=3; obtain_by=2; fzq_js_anjuke_xiaoqu_pc=bafdf8e7bbc1466795d834018c7cf58c_1656919390530_24; xxzl_cid=84fa2e27735d4a9cb7ab65e23a6d232b; xzuid=95d7fca9-4551-46aa-81d2-eacd4844e9bd'
header2 = {
    'Connection': 'close',
    'cookie': cookie2,
    'User-Agent': str(fuas.random),
}
# Total number of communities to cover; used only to derive the page count.
num = 1393
# Communities per listing page (the page count is num / PAGE_SIZE, rounded up).
PAGE_SIZE = 25
# Column order of the output table; must match the record built in
# _parse_community().
COLUMNS = ['name', 'address', 'price', '总户数', '链接', '容积率', '竣工时间', '总建面积', '物业类型']
### ------------------------------------------------------------------------------------------
# NOTE(review): this session is created but every request below goes through
# rq.get(), as in the original script.
s = rq.session()
s.keep_alive = False


def _write_start_page(path, page):
    """Store *page* (the next listing page to scrape) in the log file."""
    with open(path, 'w') as log_file:
        log_file.write(str(page))


def _read_start_page(path):
    """Return the listing page to start from, creating the log if missing.

    A missing, empty or non-numeric log, or a stored value below 1 (older
    versions of this script wrote '0' on the first run), all mean "start at
    page 1".
    """
    if not os.path.exists(path):
        _write_start_page(path, 1)
        return 1
    with open(path, 'r') as log_file:
        content = log_file.read().strip()
    try:
        page = int(content)
    except ValueError:
        page = 1
    return max(page, 1)


def _fetch(url, request_headers):
    """GET *url* and return the response body as text.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never parsed as if it were real data.
    """
    resp = rq.get(url, headers=request_headers, timeout=3)
    resp.raise_for_status()
    return resp.text


def _first_text(selector, css, strip=True):
    """Return the first text node under the elements matching *css*.

    Returns None when nothing matches instead of raising, so one incomplete
    detail page does not abort the whole run.
    """
    value = selector.css(css).xpath('.//text()').get()
    if value is not None and strip:
        value = value.strip()
    return value


def _parse_community(link):
    """Download one community detail page and return its row for the table.

    The returned list follows the order of COLUMNS. Which field each
    `.value_N` class holds is taken from the column it was stored under in
    the original script.
    """
    page = parsel.Selector(text=_fetch(link, header2))
    name = _first_text(page, '.title', strip=False)  # the original did not strip the name
    address = _first_text(page, '.sub-title')
    price = _first_text(page, '.average')
    households = _first_text(page, '.value_4')
    ratio = _first_text(page, '.value_6')
    completed = _first_text(page, '.value_2')
    area = _first_text(page, '.value_5')
    property_type = _first_text(page, '.value_0')
    return [name, address, price, households, link, ratio, completed, area, property_type]


def main():
    """Scrape every listing page from the saved start page to the last one.

    After each listing page the table is written to `output_path` and the
    log file is advanced, so an interrupted run resumes at the page that was
    in progress without losing the rows saved so far.
    """
    start = _read_start_page(log_path)
    num_page = ceil(num / PAGE_SIZE)

    if start > 1 and os.path.exists(output_path):
        # Resuming: keep the rows saved by the previous run.
        table = pd.read_csv(output_path, index_col=0, encoding='utf-8')
        table = table.reindex(columns=COLUMNS)
    else:
        table = pd.DataFrame(columns=COLUMNS)

    # num_page is the number of the last page, so the range must include it.
    for p in range(start, num_page + 1):
        url_cur = 'https://suzhou.anjuke.com/community/changshua/p{0}/#'.format(p)
        listing = parsel.Selector(text=_fetch(url_cur, headers))
        # Each `.li-row` element carries the detail-page URL in its href.
        links = listing.css('.li-row::attr(href)').getall()
        for link in links:
            record = _parse_community(link)
            print(record)
            table.loc[len(table.index)] = record
        table.to_csv(output_path, encoding='utf-8')
        _write_start_page(log_path, p + 1)
        print("第{0}页小区信息抓取完毕".format(p))


if __name__ == '__main__':
    main()
参考资料:
https://www.bilibili.com/video/BV1VL4y1378e?p=1

浙公网安备 33010602011771号