【爬虫】安居客二手房小区数据

目前国内主流房产网站中【安居客】和【房天下】提供小区信息,相较而言安居客的信息更全一点。

以苏州常熟为例,小区信息导航页网址如下(其中 {0} 为页码占位符,页码从 1 开始):

https://suzhou.anjuke.com/community/changshua/p{0}
 
首先通过访问导航页获取各小区详情页链接
  • 点进其中一个链接可以看到详情页网址格式是这样的:https://suzhou.anjuke.com/community/view/789024
  • 搜索view后面的编码可以在导航页源码中定位到记录链接的相应行
  • 这些行所属元素的类名为 li-row,搜索该类名恰好得到25条结果,与每页显示的小区数量一致

之后在详情页获取所需的小区名称、位置、房价、户数等信息

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
'''
@File    :   2-安居客二手房小区.py
@Time    :   2022/06/23 23:01:08
@Author  :   YeCheng 
@Version :   1.0
@Contact :   YeCheng_0728@163.com
@License :   (C)Copyright 2021-2022, Liugroup-NLPR-CASIA
@Desc    :   None
@Variate    :  
'''
# here put the import lib

from audioop import add
from math import ceil
import os
from pydoc import text
from urllib import response
import pandas as pd
import requests as rq
import json
import parsel
from shapely.geometry import Polygon
import geopandas as gpd
from fake_useragent import UserAgent as fua
import time

# ---- Run configuration ----------------------------------------------------
# Destination CSV for the scraped community table, and the small text file
# that stores the page number to resume from.
output_path = r'常熟二手房小区.csv'
log_path = r'pause.txt'
# Cookie string sent with listing-page requests.
# NOTE(review): presumably copied from a browser session; such values expire
# and identify the session, so replace them with your own before running.
cookie='aQQ_ajkguid=B76E9B6F-0513-B5FF-50D0-975B1EF0A078; id58=CpQDSGK6ZrU4/zZDcVHZAg==; 58tj_uuid=617316e7-8344-4c68-8d32-7768ffb101bb; als=0; _ga=GA1.2.1292231950.1656383164; ajk-appVersion=; ctid=19; sessid=01610631-6594-3834-D6E7-C6BEF387C023; twe=2; _gid=GA1.2.1523024107.1656903013; wmda_uuid=ef475760c168cf6c0f937705d3556b37; wmda_new_uuid=1; wmda_visited_projects=%3B6289197098934; lps=https%3A%2F%2Fsu.zu.anjuke.com%2Ffangyuan%2Fchangshua%2F%3Ffrom%3DSearchBar%7Chttps%3A%2F%2Fsuzhou.anjuke.com%2F; cmctid=5; fzq_h=577c79415232b90bb42012eab6c617db_1656903012492_b22187011b6a496ba72c819ce682153f_3550630220; obtain_by=2; init_refer=https%253A%252F%252Fwww.anjuke.com%252F; new_uv=4; new_session=1; fzq_js_anjuke_xiaoqu_pc=fe800f4514a9028e7625ea233b4e33de_1656919959566_24; xxzl_cid=84fa2e27735d4a9cb7ab65e23a6d232b; xzuid=95d7fca9-4551-46aa-81d2-eacd4844e9bd'
# Fixed desktop User-Agent string (the header dicts below use a random one).
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
# A dict literal keeps only the LAST value given for a repeated key. The
# original literal listed 'https' three times, so it always evaluated to this
# two-entry mapping; the silently discarded 'https' candidates were
# https://49.7.19.74 and https://218.7.171.91.
# (The original trailing comment read "isochrone parameter settings" and
# appears to be a leftover from another script.)
proxies = {'http':'http://123.57.140.217','https':'https://121.40.169.87'}
# One UserAgent generator, reused for both header sets instead of building a
# new generator for every header dict.
fuas = fua()
# Headers for the paginated listing pages.
headers = {
    'Connection': 'close',
    'cookie':cookie,
    'User-Agent':str(fuas.random)}
# Cookie string sent with community detail-page requests (same caveat as
# `cookie` above).
cookie2='SECKEY_ABVK=mVIEINJCgTyiOsDB4cHJgETYvkI5kfX7xsPr1gAWDFc%3D; BMAP_SECKEY=3GCuE6Pr8IM8LvcOOD4FgpsxMjYnvaaSiTAPe80z8w-R2VS8nhtkM5wPamEOQdWhz8GJKsx8LC9Ee6OvB7C0c6LlGQxg0opM7Wt0B-CMmryBmCrPuuWkONbK7KU17XJZc4-SMjm2wbL7utVPQ2FqR03z_JRLo5PtMWVjlEhenfbkGlG12GTU-R3VkRky5ehi; aQQ_ajkguid=B76E9B6F-0513-B5FF-50D0-975B1EF0A078; id58=CpQDSGK6ZrU4/zZDcVHZAg==; 58tj_uuid=617316e7-8344-4c68-8d32-7768ffb101bb; als=0; _ga=GA1.2.1292231950.1656383164; ajk-appVersion=; ctid=19; sessid=01610631-6594-3834-D6E7-C6BEF387C023; twe=2; _gid=GA1.2.1523024107.1656903013; wmda_uuid=ef475760c168cf6c0f937705d3556b37; wmda_new_uuid=1; wmda_visited_projects=%3B6289197098934; lps=https%3A%2F%2Fsu.zu.anjuke.com%2Ffangyuan%2Fchangshua%2F%3Ffrom%3DSearchBar%7Chttps%3A%2F%2Fsuzhou.anjuke.com%2F; cmctid=5; fzq_h=577c79415232b90bb42012eab6c617db_1656903012492_b22187011b6a496ba72c819ce682153f_3550630220; new_uv=3; obtain_by=2; fzq_js_anjuke_xiaoqu_pc=bafdf8e7bbc1466795d834018c7cf58c_1656919390530_24; xxzl_cid=84fa2e27735d4a9cb7ab65e23a6d232b; xzuid=95d7fca9-4551-46aa-81d2-eacd4844e9bd'
# Headers for the community detail pages.
header2={
    'Connection': 'close',
    'cookie':cookie2,
    'User-Agent':str(fuas.random)}
# Total number of communities to cover; the page count is derived from it at
# 25 communities per listing page.
num=1393
### ------------------------------------------------------------------------------------------
s = rq.session()
# NOTE(review): `keep_alive` does not appear to be an attribute that requests'
# Session acts on, so this is likely a no-op; the 'Connection: close' request
# header is what asks the server to drop the connection. Kept unchanged.
s.keep_alive = False
# Resume marker: `log_path` holds the listing page to start from.
# Listing pages are 1-based, so the value read back is clamped to at least 1.
# Previously a fresh run wrote '0' and started at page 1, but a later run read
# that same '0' back and started at the non-existent page 0.
try:
    with open(log_path, 'r') as f1:
        start = max(int(f1.read().strip()), 1)
except FileNotFoundError:
    # First run: create the marker file and begin at the first page.
    with open(log_path, 'w') as f1:
        f1.write('0')
    start = 1
except ValueError:
    # Empty or non-numeric marker content: begin at the first page.
    start = 1

def _first_text(node, css_query):
    """Return the first text node under *css_query*, stripped.

    Returns an empty string when nothing matches, so a detail page that lacks
    a field does not abort the whole run with AttributeError on None.
    """
    value = node.css(css_query).xpath('.//text()').get()
    return value.strip() if value is not None else ''


# 25 communities are listed per page.
num_page=ceil(num/25)
table=pd.DataFrame(columns=['name','address','price','总户数','链接','容积率','竣工时间','总建面积','物业类型'])
# NOTE(review): nothing in this loop updates `log_path`, so a restart resumes
# from whatever page number that file already holds.
# range() excludes its stop value; the +1 makes sure the last page is fetched.
for p in range(start, num_page + 1):
    url_cur = 'https://suzhou.anjuke.com/community/changshua/p{0}/#'.format(p)
    # Fetch the listing page; only the body text is used, the status code is
    # not checked.
    html_data = rq.get(url_cur,headers=headers,timeout=3).text
    # Parse the listing page and collect the detail-page links.
    page_sel = parsel.Selector(text=html_data)
    links = page_sel.css('.li-row::attr(href)').getall()
    for link in links:
        res_text = rq.get(link,headers=header2,timeout=3).text
        detail_sel = parsel.Selector(text=res_text)
        # The name is stored exactly as extracted (not stripped; may be None).
        name = detail_sel.css('.title').xpath('.//text()').get()
        address = _first_text(detail_sel, '.sub-title')
        price = _first_text(detail_sel, '.average')
        # Distinct local names: the original loop rebound the module-level
        # `num` (total community count) and shadowed the builtin `type`.
        households = _first_text(detail_sel, '.value_4')
        ratio = _first_text(detail_sel, '.value_6')
        atime = _first_text(detail_sel, '.value_2')
        area = _first_text(detail_sel, '.value_5')
        prop_type = _first_text(detail_sel, '.value_0')
        # Field order must match the DataFrame columns declared above.
        record = [name, address, price, households, link, ratio, atime, area, prop_type]
        print(record)
        table.loc[len(table.index)] = record
        # Saved after every record so a crash keeps what was scraped so far.
        # Written to the configured `output_path` rather than the leftover
        # hard-coded 'test.csv'.
        table.to_csv(output_path, encoding='utf-8')
    print("第{0}页小区信息抓取完毕".format(p))

参考资料:

https://www.bilibili.com/video/BV1VL4y1378e?p=1

 

posted @ 2022-07-04 15:37  达拉蹦跶  阅读(664)  评论(1)    收藏  举报