Loading

【爬虫】项目篇-使用xpath爬取搜房网二手房信息

1)无代理+随机请求头+lxml.etree+百度地图api获取经纬度

#使用requests和xpath从搜房网上抓取福州地区的二手房房源信息
# (要求获取所有分页上的房源,且每套房源包含标题、楼盘、
# 地点、经纬度、面积、房型、楼层、朝向、建筑年代、单价、总价、经纪人、联系电话等,缺数据的留空)。
import requests
from lxml import etree
from fake_useragent import UserAgent
import cchardet
import re
import csv
import logging
import time
import os
# 获取随机请求头
# Request headers with a randomized IE User-Agent (fresh on every run).
# NOTE: the original kept a huge commented-out cookie string here whose
# continuation line was NOT commented, which made this dict literal a
# SyntaxError; the dead commented-out UA/cookie lines were removed.
headers = {
    'user-agent': UserAgent().ie,
}


#请求网页信息
#请求网页信息
def request_url(url):
    """Fetch *url* and return the page parsed into an lxml element tree.

    The charset is detected from the raw bytes with cchardet so that
    non-UTF-8 pages decode correctly before parsing.
    """
    req = requests.get(url, headers=headers, timeout=30)  # avoid hanging forever
    req.raise_for_status()  # fail fast instead of parsing an HTTP error page
    req.encoding = cchardet.detect(req.content)['encoding']
    return etree.HTML(req.text)

def get_condetion(source):
    """Extract every listing on one result page and append each row to the CSV.

    Pulls title, estate, address, coordinates, area, layout, floor,
    orientation, build year, unit/total price, agent, phone and tags,
    zips them row-wise and hands every row to save().

    Fixes vs. the original: the bare ``except:`` is narrowed, the agent-page
    fetch is inside the ``try`` (a network failure now yields an empty phone
    instead of crashing the whole page), and the pointless ``time.sleep(5)``
    calls between purely local xpath parses were removed.
    """
    # 1. title
    title_list = source.xpath('//p[@class="name"]/a/text()')
    # 2. estate name
    building_list = source.xpath('//strong/text()')
    # 3. address (drop whitespace-only entries)
    add_init = source.xpath("//span[@title]")
    add_list = [add.xpath("text()")[0] for add in add_init]
    add_list = [i.strip() for i in add_list if i.strip() != '']
    # map link next to each address; it carries the coordinates in its query string
    map_url = [''.join(i.xpath("following-sibling::*[1]/@href")) for i in add_init]
    # 4. longitude/latitude parsed out of the map link ('' when no link)
    locate_list = [",".join([re.search("longitude=(.+)&latitude=(.+)", i, re.S).group(1),
                             re.search("longitude=(.+)&latitude=(.+)", i, re.S).group(2)])
                   if i != '' else "" for i in map_url]
    # 5. area
    area_list = source.xpath('//p[@class="type clearfix"]/span[1]/text()')
    # 6. layout
    layout_list = source.xpath('//p[@class="type clearfix"]/span[3]/text()')
    layout_list = [i.replace("\r\n", "").replace(" ", "") for i in layout_list]
    # 7. floor
    stories_list = source.xpath('//p[@class="type clearfix"]/span[5]/text()')
    # 8. orientation
    direction_list = source.xpath('//p[@class="type clearfix"]/span[7]/text()')
    # 9. build year
    year_list = source.xpath('//p[@class="type clearfix"]/span[10]/text()')
    # 10. unit price
    uprice_list = source.xpath('//dd[@class="house_price"]/p[2]/text()')
    uprice_list = [i.replace("\r\n", "").replace(" ", "") for i in uprice_list]
    # 11. total price
    tprice_list = source.xpath('//dd[@class="house_price"]/p[1]/span/text()')
    tprice_list = [i + "万" for i in tprice_list]
    # 12. agent
    agent_list = source.xpath('//a[@class="broker_name"]/text()')
    agent_list = [i.strip() for i in agent_list if i.strip() != '']
    # 13. phone: each agent's own page must be fetched separately
    tel_url = source.xpath('//a[@class="broker_name"]/@href')
    tel_list = []
    for url in tel_url:
        url = "https://fz.sofang.com/" + url  # NOTE(review): may double a leading '/' in href — confirm
        try:
            content = request_url(url)
            tel = content.xpath('//div[@class="broker_tel"]/text()')
            tel = [i.strip() for i in tel if i.strip() != '']
            tel_list.append(''.join(tel))
        except (requests.RequestException, IndexError, AttributeError):
            # leave the phone blank when the agent page cannot be fetched/parsed
            tel_list.append("")
        time.sleep(2)  # be polite between agent-page requests
    # 14. tags, joined with '/'
    tag_list = []
    for node in source.xpath('//p[@class="tag clearfix"]'):
        tag_list.append("/".join(node.xpath('span/text()')))
    # NOTE: zip() truncates to the shortest list, so a listing with a missing
    # field can shift or hide rows — same behavior as the original.
    for row in zip(title_list, building_list, add_list, locate_list, area_list,
                   layout_list, stories_list, direction_list, year_list, uprice_list,
                   tprice_list, agent_list, tel_list, tag_list):
        data = list(row)
        print(data)
        save(data)
def save(data):
    """Append one listing row to the CSV output file (UTF-8 with BOM)."""
    with open("搜房网-福州房价.csv", "a", encoding="utf-8-sig", newline="") as fh:
        csv.writer(fh).writerow(data)

#调用百度地图api获取经纬度
def getlocation_from_api(address):
    bdurl = "https://api.map.baidu.com/geocoding/v3/?"
    params = {
        'address': address,
        'output': 'json',
        'ak': "UYKdveDmML50ykiqRIFfLekfCWcgeB4r",
        # 'callback': 'showlocation',
        'city': '福州'
    }
    req = requests.get(bdurl,headers=headers,params=params)
    #经度
    lgn=req.json()['result']['location']['lng']
    #纬度
    lat = req.json()['result']['location']['lat']
    return ','.join([str(lgn),str(lat)])

if __name__ == '__main__':
    base_url = "https://fz.sofang.com/esfsale/area/"
    source = request_url(base_url)
    # the third-from-last pagination link holds the total page count
    page = source.xpath('//li/a[@alt]/text()')[-3]
    # write the CSV header only on the first run (idiom fix: dropped
    # `== True` comparison and the redundant f.close() inside `with`)
    if not os.path.exists("搜房网-福州房价.csv"):
        with open("搜房网-福州房价.csv", "w", encoding="utf-8-sig", newline="") as f:
            csv.writer(f).writerow(['标题', '楼盘', '地址', '经纬度', '面积', '房型', '楼层',
                                    '朝向', '建筑年代', '单价', '总价', '经纪人', '联系电话', '标签'])

    for i in range(1, int(page) + 1):
        url = base_url + "bl" + str(i) + "?"
        content = request_url(url)
        print(url)
        time.sleep(10)
        get_condetion(content)
        time.sleep(10)  # BUGFIX: original `time. Sleep(10)` raised AttributeError

2)无代理+固定请求头+lxml.html

import requests
import cchardet
from lxml import html
import re
import xlsxwriter
import time
import random
# Fixed request headers. BUGFIX: the original cookie string literal was split
# across two physical lines without quoting the continuation (a SyntaxError);
# it is rejoined here via implicit adjacent string-literal concatenation.
head = {
'cookie': 'UM_distinctid=17c7f7b1d8b7c1-018dda850a8de5-b7a1b38-1fa400-17c7f7b1d8ccca; uniqueName=e1aac984e2899a3acdb3ec3f75190084; cityNum=28; Hm_lvt_bc2dcba98591e113d6329631910c965b=1634734120,1634791780,1634901813,1634905507; Hm_lvt_d2801fc638056c1aac7e8008f41cf828=1634280948,1634737369,1634798622,1634905516; Hm_lpvt_d2801fc638056c1aac7e8008f41cf828=1634905551; remember_82e5d2c56bdd0811318f0cf078b78bfc=eyJpdiI6Ild4S0I2ZVJPMjFLQkw3V1l1UTF6Mmc9PSIsInZhbHVlIjoiKytpb3NNTVRlV3Ntcjh2Vk9rSHB4QzFZNVRPTjNZWmF3UUNYdENESFwvWTA1dEw3SHlkaHYzXC9jb2lBZEVSelowaFNUQkdqS3Q1MWVsWTlnVVIxV29rY04rWThpdW1ZeUh6SnRYZzE3ZjNEZz0iLCJtYWMiOiJiOGFlZDZiNWQwNWMyNjBkMjhlMmQwZDMyMTU3YzQwYzM4YzdhYjcxM2Q5ODAyYzgyZWM3MzYzNWJlNjg2ZjhhIn0%3D; codeNum=eyJpdiI6IjVqSUJSNDlQVE1jaGRoN3NqemtnSlE9PSIsInZhbHVlIjoiS25vdzVGNjNLcVVNKzlQTVN2YSt0aW12VjR5K1A5U0lTRFUya2hndnZGOFlKZmdlaVFlTW1BTndoYTE0TncwRCIsIm1hYyI6IjJjYzBjYWU0YmYxNjJkOGFmZDUwOTg1OWNjNzBiNDY0ZjczNTBlMTY5ZTlhMGVlMDc0YWZmY2Q2MzI0NjkxYzEifQ%3D%3D; CNZZDATA1262285598=1501788919-1634225051-https%253A%252F%252Fwww.baidu.com%252F%7C1634907133; Hm_lpvt_bc2dcba98591e113d6329631910c965b=1634912068; XSRF-TOKEN=eyJpdiI6IlpMMVZ0VWFGYXUrUVMwclJ1a0phQXc9PSIsInZhbHVlIjoiM0dtWGZrSGJLN2o1bitZdWVINnZJWTNWWUhjc1N4Yk5pUjd1UkpaSnM3d2xLMTErTUZJVzcrZkdUdUprdFFmTVNwQloyZEZZSGdsYzkrK3hzV1RcL1pnPT0iLCJtYWMiOiIwNDRkNmQ3MGU3MGY4NzQ0NDFjMjUyNWVlYjgxZGIwNThkNWU3YTdlZDhhMWU0MzkxZTA1YWQ4MTYxMDAzNjkzIn0%3D; www_sofang_session=eyJpdiI6Ik5UclBnbnBWbnE1OWxQV3FlXC94dU53PT0iLCJ2YWx1ZSI6IjFlMVUwUGZsMDFHRUl3V0p5VlZcL2hSakVnSjFqTllPa0RYZE95Wk95NElFcW53TDk3eldCVUVcL2hZMUgxaEhUaVwvRGI0eHc1MW5FcW8zaUEzdFhiVDBRPT0iLCJtYWMiOiI4NGRhZWZkZGEwZmViYmQ1NTE3MTc4MzJiYTNlNmFhYmY2NGZjMTFhYjkyZGIyNTFjY2Q3MzU2YzgyM2Y4ZDNiIn0%3D; cityid=eyJpdiI6InF3VlV2WXY0YXowVmh5OUNCQnRMQUE9PSIsInZhbHVlIjoiVmtLWTRiXC9QMk9OWWZJUFpJbHJ4NHc9PSIsIm1hYyI6IjQ3NGQ3YzY4ZjQ3ZjVhNTMyNTJmODQ2ZTljYTA0NTY3YTRhYTNiODc0NWNlNTgyZmViMmMyZWRiYjM5OTYwY2IifQ%3D%3D; '
          'city=eyJpdiI6IlM4ZUtzcWt4QnY2RHppaTllaFFFTmc9PSIsInZhbHVlIjoiUndaXC92OFdycENKNW1lU01SUUZEdDZLQm1zMjJieVdJOWhJb25vN3FpeWZoV0xpWXBXTDdkejQ3b05cL1pTNUk5ME5HbXl3TWZkUWF6cW9icEZaS05kUkgrOFJlMTdaQmFxZ1lwdWx0elcxXC9OcTNFRGtGZm5zcnBlaFBCXC91ZzJ1RjNDY29WaHNGZWJMUGJHUDkzSGJiYnNoWWFXVXhaN09kOEZVVFVkVm50SjZLK0hVMVhYOGVweXY1NkxHbXhrXC9uQktIc3FJUnpvZFwvXC9JWTBcL3Y2ZlJBPT0iLCJtYWMiOiI0NTM3MDU4ODA2YzAzYWRhOTQ3YjhlOTZlOGU4ZWIyMGU4NTRkMTk4ZWM4MTEzODZhNmJmZmNlNDNmYTgxOGRjIn0%3D; citypy=eyJpdiI6ImJrZHhySjNTKzhONmpaeEpHK0RHRHc9PSIsInZhbHVlIjoiZmd2Q051VGpTZEhNTFBiM1FpbHRCZz09IiwibWFjIjoiYmUxNTc3NWMyYTg5ZTA3M2U3NWQ3OTIyZTkwNmE3MzJmNDQ3OTJhMzM1MjY4N2Q0NTc0NzEyMTExY2YzOGQwZSJ9; cityId=eyJpdiI6IkZaNjU4QnU4TXNibldnYWEwenpZSEE9PSIsInZhbHVlIjoidVZBVFhnVE1QamZrY1o0ZWNqTnVnUT09IiwibWFjIjoiYzk2NjRmNjZmZjY5MWVkZTY0OGVjOWM2ZGI4YzJkYjExMjJmMzE5Y2NhMmJjYmE3YmVkYzQ5Njc2MGRiZDdhNCJ9',
# 'upgrade-insecure-requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}

def get_source(url):
    """GET *url* with the shared headers and return the decoded page text."""
    resp = requests.get(url, headers=head, timeout=30)  # timeout: don't hang forever
    # detect the real charset from the raw bytes before using resp.text
    resp.encoding = cchardet.detect(resp.content)['encoding']
    return resp.text

def is_url_in_list(url, house_info_list):
    """Return the index of the record whose broker URL (field 12) equals
    *url*, or -1 when no record matches.

    Idiom fix: manual while-loop indexing replaced by enumerate().
    """
    for idx, info in enumerate(house_info_list):
        if info[12] == url:
            return idx
    return -1

def get_data_secondhand_house_onepage(source, house_info_list):
    """Parse one listings page (*source*, raw HTML text) and append one
    14-tuple per listing to *house_info_list*.

    Tuple layout: (title, estate, address, "lng,lat", area, layout, storey,
    orientation, build year, total price, unit price, broker name,
    broker url, broker tel).
    """
    root = html.fromstring(source)
    # one <dl> per listing
    house_resources_list = root.xpath('//div[@class="list_l"]/div/dl')
    for house_resource in house_resources_list:
        title = house_resource.xpath('dd[@class="house_msg"]/p[@class="name"]/a/text()')[0]
        estate = house_resource.xpath('dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"area")]//strong/text()')[0]
        address = house_resource.xpath('dd[@class="house_msg"]//span[@class="address"]/text()')[0].strip()
        # the map link (when present) embeds the coordinates in its query string
        longitude_latitude = house_resource.xpath('dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"area")]/a[@target="_blank"]/@href')
        if longitude_latitude:
            longitude_latitude = re.search(r'longitude=(.*?)&latitude=(.*)', longitude_latitude[0], re.S)
            longitude_latitude = '{},{}'.format(longitude_latitude.group(1), longitude_latitude.group(2))
        else:
            longitude_latitude = ''
        # the "type" <p> holds the area/layout/storey/orientation/year spans
        house_type = house_resource.xpath('dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"type")]')[0]
        area = house_type.xpath('span[1]/text()')[0]
        layout = re.sub('[\s\r\n]', '', house_type.xpath('span[3]/text()')[0])
        storey = house_type.xpath('span[5]/text()')[0]
        orientations = house_type.xpath('span[7]/text()')[0]
        # build year is optional on the page; default to ''
        construction_time = house_type.xpath('span[10]')
        if construction_time:
            construction_time = construction_time[0].xpath('text()')[0]
        else:
            construction_time = ''
        price = house_resource.xpath('dd[@class="house_price"]/p')[0].xpath('string(.)').strip()
        unit_price = house_resource.xpath('dd[@class="house_price"]/p[@class="junjia"]/text()')[0].strip()
        broker_a_tag = house_resource.xpath('dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"tag")]/a[@class="broker_name"]')[0]
        broker_url = 'https://fz.sofang.com' + broker_a_tag.xpath('@href')[0]
        broker_name = broker_a_tag.xpath('string(.)').strip()
        # reuse a phone number already fetched for the same broker URL to
        # avoid re-requesting the broker page
        pos_in_house_info_list = is_url_in_list(broker_url, house_info_list)
        if pos_in_house_info_list != -1:
            broker_tel = house_info_list[pos_in_house_info_list][13]
        else:
            time.sleep(random.randint(1, 3))
            broker_root = html.fromstring(get_source(broker_url))
            broker_tel = broker_root.xpath('//div[@class="broker_tel"]')[0].xpath('string(.)').strip()
        print('标题:{}\n楼盘:{}\n地点:{}\n经纬度:{}\n面积:{}\n房型:{}\n楼层:{}\n朝向:{}\n建筑年份:{}\n总价:{}\n单价:{}\n经纪人:{}\n经纪人url:{}\n电话:{}\n'
              .format(title,estate,address,longitude_latitude,area,layout,storey,orientations,construction_time,price,unit_price,broker_name,broker_url,broker_tel))
        house_info_list.append((title, estate, address, longitude_latitude, area, layout, storey, orientations,
                                construction_time, price, unit_price, broker_name, broker_url, broker_tel))


def get_next_url(source):
    """Return the absolute URL of the "next page" link, or the empty
    xpath result when the current page is the last one."""
    tree = html.fromstring(source)
    hits = tree.xpath('//a[contains(text(),"下一页")]/@href')
    if not hits:
        return hits
    return 'https://fz.sofang.com' + hits[0]

def save_data(house_info_list):
    """Write all collected listings to soufang.xlsx: one header row plus
    one row per listing tuple.

    Idiom fix: worksheet.write_row() with numeric (row, col) addressing
    replaces 14 hand-written write('A1'...) calls and the manual
    chr(ord('A')+j) cell-name arithmetic.
    """
    workbook = xlsxwriter.Workbook('soufang.xlsx')
    worksheet = workbook.add_worksheet()
    header = ['标题', '楼盘', '地点', '经纬度', '面积', '房型', '楼层', '朝向',
              '建筑年份', '总价', '单价', '经纪人', '经纪人url', '电话']
    worksheet.write_row(0, 0, header)
    # data rows start at spreadsheet row 2, i.e. index 1
    for row_idx, house_info in enumerate(house_info_list, start=1):
        worksheet.write_row(row_idx, 0, house_info)
    workbook.close()


if __name__ == '__main__':
    house_info_list = []
    i = 1
    url = 'https://fz.sofang.com/esfsale/area'
    while True:
        # Fetch one Fuzhou second-hand-listings page.
        head['referer'] = 'https://fz.sofang.com/esfsale/area/bl{}?'.format(i)  # IMPORTANT: the site's anti-scraping check inspects the referer header
        source = get_source(url)
        # Extract the listings from the page source.
        get_data_secondhand_house_onepage(source, house_info_list)
        time.sleep(random.randint(1, 3))
        print(f'已提取第{i}页的数据\n')
        url = get_next_url(source)
        if not url:
            break
        i += 1

    # # Dump the collected rows (debug aid, disabled).
    # for house_info in house_info_list:
    #     print('标题:{}\n楼盘:{}\n地点:{}\n经纬度:{}\n面积:{}\n房型:{}\n楼层:{}\n朝向:{}\n建筑年份:{}\n总价:{}\n单价:{}\n经纪人:{}\n经纪人url:{}\n电话:{}\n'
    #           .format(house_info[0],house_info[1],house_info[2],house_info[3],house_info[4],
    #                   house_info[5],house_info[6],house_info[7],house_info[8],house_info[9],house_info[10],house_info[11],house_info[12],house_info[13]))
    
    save_data(house_info_list)

3)无代理+固定请求头+使用HTMLSession

from requests_html import HTMLSession
import re
import xlsxwriter
import time
import random
# from fake_useragent import UserAgent


# Fixed request headers. BUGFIX: the original cookie string literal was split
# across two physical lines without quoting the continuation (a SyntaxError);
# it is rejoined here via implicit adjacent string-literal concatenation.
head = {
'cookie': 'cityNum=28; cityNum=28; UM_distinctid=17c7f7b1d8b7c1-018dda850a8de5-b7a1b38-1fa400-17c7f7b1d8ccca; uniqueName=e1aac984e2899a3acdb3ec3f75190084; cityNum=28; remember_82e5d2c56bdd0811318f0cf078b78bfc=eyJpdiI6Ild4S0I2ZVJPMjFLQkw3V1l1UTF6Mmc9PSIsInZhbHVlIjoiKytpb3NNTVRlV3Ntcjh2Vk9rSHB4QzFZNVRPTjNZWmF3UUNYdENESFwvWTA1dEw3SHlkaHYzXC9jb2lBZEVSelowaFNUQkdqS3Q1MWVsWTlnVVIxV29rY04rWThpdW1ZeUh6SnRYZzE3ZjNEZz0iLCJtYWMiOiJiOGFlZDZiNWQwNWMyNjBkMjhlMmQwZDMyMTU3YzQwYzM4YzdhYjcxM2Q5ODAyYzgyZWM3MzYzNWJlNjg2ZjhhIn0%3D; Hm_lvt_d2801fc638056c1aac7e8008f41cf828=1634737369,1634798622,1634905516,1634974173; Hm_lvt_bc2dcba98591e113d6329631910c965b=1634901813,1634905507,1634974076,1635006212; CNZZDATA1262285598=1501788919-1634225051-https%253A%252F%252Fwww.baidu.com%252F%7C1635000137; codeNum=eyJpdiI6ImhCZHgwVTduV1R0bjNnc1R4QmJDMWc9PSIsInZhbHVlIjoiWTZ2UjRPKzFBTEhKWlZBRGdnNFdQRlJEOHlRelZVSTQzZHNBSnJnd3NQWVdBc2JyVXZ2dmpVYlpkT1lseExleCIsIm1hYyI6IjdkYjFkZjE2YTY3Y2IxYmExNmYwNWM0OThjMmM3MjUwMmMwNWU4ODFkYjQ3YzAyMTU4NDdkNDk0MzFlYzFkYTYifQ%3D%3D; Hm_lpvt_bc2dcba98591e113d6329631910c965b=1635006288; XSRF-TOKEN=eyJpdiI6IithNWpjSUNmRFdSZkp3THFENnNsQ1E9PSIsInZhbHVlIjoiR0YyOTJVNGo5SXhZdlVpZ1o2NnRJVWFSVnZ5WXNpbkpxRjJBbEVMTW9FWXR1bVkybndpcGo1ZlN5eVU1Y3ArbEpkVkQ0U2RUV1BmaWJnOXRlY3RwekE9PSIsIm1hYyI6IjJlODY0ZDc1MjcwYzFiNDlhNzBmYzk5MDZjNzAxOTJmMmI4Y2U2YTdmNzQxMjY0Y2EwMDZiNjM4YWQwN2NhZDYifQ%3D%3D; www_sofang_session=eyJpdiI6ImpCSDl0TlNnQXpNWEFNNmJlQzU5eHc9PSIsInZhbHVlIjoiU1U5dkR3SnQyaEdPQzhvcXpYUXVlaTZsbnJhS3hudFU1QjZVM0ZjbFVaYzVNNVo1XC81NTVhdXZmM0luUkFCblFJOUp1c2JUR3dJblBCZStGZEVDcjRBPT0iLCJtYWMiOiJlZDczNmFjMGQ3ZjMxOGE4ZTQ0MjM4Y2UwODI1MzQzODBkNzA0NjIwNjgzYThiODgyMThkNGJjNTkzZWYyZmFkIn0%3D; cityid=eyJpdiI6InNsOEtBdUdsb0RhYUN2QWdqa2FFM3c9PSIsInZhbHVlIjoidFdkK2JpdmgyVHV1MnlQdDdBSzcxQT09IiwibWFjIjoiODI5YTRmNDJjM2JiMDA2M2NhOWJjYTU5ZWQ4NmJkMTNhYmI1YjE5ZWI2YWNjODNjMjY0M2RlYTQ0ZDhlZWMzZiJ9; '
          'city=eyJpdiI6InVscGlKYzg0WHVTbnNjS0Ztb29QNEE9PSIsInZhbHVlIjoiXC9LXC85Y1B0ZG05a2lKSjRYdVFoR0poV0tZRzhNazZKZ1lOajJUWVlMTXZaMHJRZG0weDFSVDRVejdRMlRJNktZYituRHBBWHpNb3N4aW54dGJ3MlFhSWNlS3pGWE9Jek81elB1eW5Lb2hYN05hcng0UXpCVzM1MHVIbVJEXC9IQlpHVVNnazBjeEY3c1BZMkZtbUQ0bGFhQXluTUFZZjAzYUJ5UGFMbVhnQUJ6bFZnbHRuZEhFNU9Sb0xQczBCU0VtSFwvNzh3Ym1tRDFFSDE4OXBCM2RVRUE9PSIsIm1hYyI6ImM0ZmVjMjY3ZWRlYWQ1M2JkNTNjMzYxY2E0NTJlYjk1ZjYyM2VhMjI1MjA0NDJjZGNlODgwYTE2NzdiOTYxZTcifQ%3D%3D; citypy=eyJpdiI6ImFhRklYOHBQVHZ1bm1YS1lPZDJBbnc9PSIsInZhbHVlIjoiZ1M2WkJPWkRLV0lLWmc3aGhjY0p2Zz09IiwibWFjIjoiMTJmYWQxMTExNTc0MzJkZGVhMWM3MTViZDU2NDE5Y2FlZGNmNDhiNGE0NjA5NzE4YmQ1OTQzMDY1ZDE0ZWZmNyJ9; cityId=eyJpdiI6IlZDYjFHQjg1cDB0cXNBNVh2cVh2S0E9PSIsInZhbHVlIjoidUpTMzNjSXVON3dvTnZub3RDWklzUT09IiwibWFjIjoiYjg3ODlmY2VmMjgxMGI2YWQ1ZjU5ZjZmMmFhM2NmZGQ3NDE4YzEzNTRiOGE0ZjdiN2JhM2IzZjk3NWE0YWQ2ZSJ9',
'upgrade-insecure-requests': '1',
'Connection': 'close',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}

# def get_useragent():
#     location = os.getcwd() + '\\fake_useragent_0.1.11.json'
#     ua = UserAgent(path=location)
#     return ua.random

def get_source(url):
    """GET *url* through a requests-html session and return the parsed
    HTML object.

    Fixes vs. the original: the per-call HTMLSession is now closed (the
    original leaked one session and its pooled connections per call) and a
    timeout is set so the request cannot hang forever.
    """
    session = HTMLSession()
    try:
        # verify=False kept from the original — flags TLS verification off
        resp = session.get(url, headers=head, verify=False, timeout=30)
        return resp.html
    finally:
        session.close()

def is_url_in_list(url, house_info_list):
    """Locate *url* among the stored broker URLs (tuple field 12);
    return its index, or -1 when absent."""
    for position, record in enumerate(house_info_list):
        if record[12] == url:
            return position
    return -1

def get_data_secondhand_house_onepage(source, house_info_list):
    """Parse one listings page (*source*, a requests_html HTML object) and
    append one 14-tuple per listing to *house_info_list*.

    NOTE(review): the '//'-prefixed xpaths below look document-absolute, but
    requests_html re-parses each Element's own HTML fragment, so they appear
    to be fragment-rooted in practice — confirm before refactoring to './/'.
    """
    house_resources_list = source.xpath('//div[@class="list_l"]/div/dl')
    for house_resource in house_resources_list:
        title = house_resource.xpath('//dd[@class="house_msg"]/p[@class="name"]/a/text()')[0]
        estate = house_resource.xpath('//dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"area")]//strong/text()')[0]
        address = house_resource.xpath('//dd[@class="house_msg"]//span[@class="address"]/text()')[0].strip()
        # the map link (when present) embeds the coordinates in its query string
        longitude_latitude = house_resource.xpath('//dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"area")]/a[@target="_blank"]/@href')
        if longitude_latitude:
            longitude_latitude = re.search(r'longitude=(.*?)&latitude=(.*)', longitude_latitude[0], re.S)
            longitude_latitude = '{},{}'.format(longitude_latitude.group(1), longitude_latitude.group(2))
        else:
            longitude_latitude = ''
        # the "type" <p> holds the area/layout/storey/orientation/year spans
        house_type = house_resource.xpath('//dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"type")]')[0]
        area = house_type.xpath('//span[1]/text()')[0]
        layout = re.sub('[\s\r\n]', '', house_type.xpath('//span[3]/text()')[0])
        storey = house_type.xpath('//span[5]/text()')[0]
        orientations = house_type.xpath('//span[7]/text()')[0]
        # build year is optional on the page; default to ''
        construction_time = house_type.xpath('//span[10]/text()')
        if construction_time:
            construction_time = construction_time[0]
        else:
            construction_time = ''
        price = house_resource.xpath('//dd[@class="house_price"]/p')[0].text
        unit_price = house_resource.xpath('//dd[@class="house_price"]/p[@class="junjia"]/text()')[0].strip()
        broker_a_tag = house_resource.xpath('//dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"tag")]/a[@class="broker_name"]')[0]
        broker_url = 'https://fz.sofang.com' + broker_a_tag.xpath('//@href')[0]
        broker_name = broker_a_tag.text
        # reuse a phone number already fetched for the same broker URL to
        # avoid re-requesting the broker page
        pos_in_house_info_list = is_url_in_list(broker_url, house_info_list)
        if pos_in_house_info_list != -1:
            broker_tel = house_info_list[pos_in_house_info_list][13]
        else:
            time.sleep(random.randint(1, 3))
            source_broker = get_source(broker_url)
            broker_tel = source_broker.xpath('//div[@class="broker_tel"]')[0].text
        print('标题:{}\n楼盘:{}\n地点:{}\n经纬度:{}\n面积:{}\n房型:{}\n楼层:{}\n朝向:{}\n建筑年份:{}\n总价:{}\n单价:{}\n经纪人:{}\n经纪人url:{}\n电话:{}\n'
              .format(title,estate,address,longitude_latitude,area,layout,storey,orientations,construction_time,price,unit_price,broker_name,broker_url,broker_tel))
        house_info_list.append((title, estate, address, longitude_latitude, area, layout, storey, orientations,
                                construction_time, price, unit_price, broker_name, broker_url, broker_tel))


def get_next_url(source):
    """Return the absolute URL of the "next page" link, or the empty
    xpath result when the current page is the last one."""
    hits = source.xpath('//a[contains(text(),"下一页")]/@href')
    if not hits:
        return hits
    return 'https://fz.sofang.com' + hits[0]

def save_data(house_info_list):
    """Write all collected listings to soufang.xlsx: one header row plus
    one row per listing tuple.

    Idiom fix: worksheet.write_row() with numeric (row, col) addressing
    replaces 14 hand-written write('A1'...) calls and the manual
    chr(ord('A')+j) cell-name arithmetic.
    """
    workbook = xlsxwriter.Workbook('soufang.xlsx')
    worksheet = workbook.add_worksheet()
    header = ['标题', '楼盘', '地点', '经纬度', '面积', '房型', '楼层', '朝向',
              '建筑年份', '总价', '单价', '经纪人', '经纪人url', '电话']
    worksheet.write_row(0, 0, header)
    # data rows start at spreadsheet row 2, i.e. index 1
    for row_idx, house_info in enumerate(house_info_list, start=1):
        worksheet.write_row(row_idx, 0, house_info)
    workbook.close()


if __name__ == '__main__':
    house_info_list = []
    i = 0
    url = 'https://fz.sofang.com/esfsale/area'
    while True:
        # Fetch one Fuzhou second-hand-listings page; the site's anti-scraping
        # check inspects the referer header, so set it per page.
        head['referer'] = 'https://fz.sofang.com/esfsale/area/bl{}?'.format(i)
        source = get_source(url)
        # Extract the listings from the page source.
        get_data_secondhand_house_onepage(source, house_info_list)
        time.sleep(random.randint(1, 3))
        print(f'已提取第{i+1}页的数据\n')
        url = get_next_url(source)
        if not url:
            break
        i += 1

    # Dump the collected rows (debug aid, disabled).
    # for house_info in house_info_list:
    #     print('标题:{}\n楼盘:{}\n地点:{}\n经纬度:{}\n面积:{}\n房型:{}\n楼层:{}\n朝向:{}\n建筑年份:{}\n总价:{}\n单价:{}\n经纪人:{}\n经纪人url:{}\n电话:{}\n'
    #           .format(house_info[0],house_info[1],house_info[2],house_info[3],house_info[4],
    #                   house_info[5],house_info[6],house_info[7],house_info[8],house_info[9],house_info[10],house_info[11],house_info[12],house_info[13]))
    
    save_data(house_info_list)

4)代理池+固定请求头+lxml.html

import requests
import cchardet
from lxml import html
import re
import xlsxwriter
import time
import random
from requests.exceptions import ConnectionError
# Static pool of HTTP proxies, tried in order when a connection fails.
# NOTE(review): 'http://27.150.87.98' has no port — requests will default to
# port 80; confirm that proxy is reachable there.
# NOTE(review): only the 'http' scheme is mapped, so https:// requests will
# not go through these proxies — verify that is intended.
proxies_lst = [{'http':'http://1.83.117.252:8118'},{'http':'http://118.117.188.175:3256'},{'http':'http://117.68.192.118:1133'},
               {'http':'http://66.183.100.156:3128'},{'http':'http://197.255.253.210:58136'},{'http':'http://218.64.142.178:9999'},
               {'http':'http://47.98.183.59:3128'},{'http':'http://27.159.188.65:3256'},{'http':'http://27.159.188.220:3256'},
               {'http':'http://27.159.188.228:3256'},{'http':'http://27.159.188.168:3256'},{'http':'http://27.150.87.98'},
               {'http':'http://27.157.193.55:8888'},{'http':'http://61.154.64.120:9999'},{'http':'http://27.159.188.17:3256'},
               {'http':'http://182.92.194.49:8118'},{'http':'http://47.106.127.219:39746'},{'http':'http://58.220.95.8:10174'}]

# Fixed request headers. BUGFIX: the original cookie string literal was split
# across two physical lines without quoting the continuation (a SyntaxError);
# it is rejoined here via implicit adjacent string-literal concatenation.
head = {
'cookie': 'UM_distinctid=17c7f7b1d8b7c1-018dda850a8de5-b7a1b38-1fa400-17c7f7b1d8ccca; uniqueName=e1aac984e2899a3acdb3ec3f75190084; cityNum=28; Hm_lvt_bc2dcba98591e113d6329631910c965b=1634734120,1634791780,1634901813,1634905507; Hm_lvt_d2801fc638056c1aac7e8008f41cf828=1634280948,1634737369,1634798622,1634905516; Hm_lpvt_d2801fc638056c1aac7e8008f41cf828=1634905551; remember_82e5d2c56bdd0811318f0cf078b78bfc=eyJpdiI6Ild4S0I2ZVJPMjFLQkw3V1l1UTF6Mmc9PSIsInZhbHVlIjoiKytpb3NNTVRlV3Ntcjh2Vk9rSHB4QzFZNVRPTjNZWmF3UUNYdENESFwvWTA1dEw3SHlkaHYzXC9jb2lBZEVSelowaFNUQkdqS3Q1MWVsWTlnVVIxV29rY04rWThpdW1ZeUh6SnRYZzE3ZjNEZz0iLCJtYWMiOiJiOGFlZDZiNWQwNWMyNjBkMjhlMmQwZDMyMTU3YzQwYzM4YzdhYjcxM2Q5ODAyYzgyZWM3MzYzNWJlNjg2ZjhhIn0%3D; codeNum=eyJpdiI6IjVqSUJSNDlQVE1jaGRoN3NqemtnSlE9PSIsInZhbHVlIjoiS25vdzVGNjNLcVVNKzlQTVN2YSt0aW12VjR5K1A5U0lTRFUya2hndnZGOFlKZmdlaVFlTW1BTndoYTE0TncwRCIsIm1hYyI6IjJjYzBjYWU0YmYxNjJkOGFmZDUwOTg1OWNjNzBiNDY0ZjczNTBlMTY5ZTlhMGVlMDc0YWZmY2Q2MzI0NjkxYzEifQ%3D%3D; CNZZDATA1262285598=1501788919-1634225051-https%253A%252F%252Fwww.baidu.com%252F%7C1634907133; Hm_lpvt_bc2dcba98591e113d6329631910c965b=1634912068; XSRF-TOKEN=eyJpdiI6IlpMMVZ0VWFGYXUrUVMwclJ1a0phQXc9PSIsInZhbHVlIjoiM0dtWGZrSGJLN2o1bitZdWVINnZJWTNWWUhjc1N4Yk5pUjd1UkpaSnM3d2xLMTErTUZJVzcrZkdUdUprdFFmTVNwQloyZEZZSGdsYzkrK3hzV1RcL1pnPT0iLCJtYWMiOiIwNDRkNmQ3MGU3MGY4NzQ0NDFjMjUyNWVlYjgxZGIwNThkNWU3YTdlZDhhMWU0MzkxZTA1YWQ4MTYxMDAzNjkzIn0%3D; www_sofang_session=eyJpdiI6Ik5UclBnbnBWbnE1OWxQV3FlXC94dU53PT0iLCJ2YWx1ZSI6IjFlMVUwUGZsMDFHRUl3V0p5VlZcL2hSakVnSjFqTllPa0RYZE95Wk95NElFcW53TDk3eldCVUVcL2hZMUgxaEhUaVwvRGI0eHc1MW5FcW8zaUEzdFhiVDBRPT0iLCJtYWMiOiI4NGRhZWZkZGEwZmViYmQ1NTE3MTc4MzJiYTNlNmFhYmY2NGZjMTFhYjkyZGIyNTFjY2Q3MzU2YzgyM2Y4ZDNiIn0%3D; cityid=eyJpdiI6InF3VlV2WXY0YXowVmh5OUNCQnRMQUE9PSIsInZhbHVlIjoiVmtLWTRiXC9QMk9OWWZJUFpJbHJ4NHc9PSIsIm1hYyI6IjQ3NGQ3YzY4ZjQ3ZjVhNTMyNTJmODQ2ZTljYTA0NTY3YTRhYTNiODc0NWNlNTgyZmViMmMyZWRiYjM5OTYwY2IifQ%3D%3D; '
          'city=eyJpdiI6IlM4ZUtzcWt4QnY2RHppaTllaFFFTmc9PSIsInZhbHVlIjoiUndaXC92OFdycENKNW1lU01SUUZEdDZLQm1zMjJieVdJOWhJb25vN3FpeWZoV0xpWXBXTDdkejQ3b05cL1pTNUk5ME5HbXl3TWZkUWF6cW9icEZaS05kUkgrOFJlMTdaQmFxZ1lwdWx0elcxXC9OcTNFRGtGZm5zcnBlaFBCXC91ZzJ1RjNDY29WaHNGZWJMUGJHUDkzSGJiYnNoWWFXVXhaN09kOEZVVFVkVm50SjZLK0hVMVhYOGVweXY1NkxHbXhrXC9uQktIc3FJUnpvZFwvXC9JWTBcL3Y2ZlJBPT0iLCJtYWMiOiI0NTM3MDU4ODA2YzAzYWRhOTQ3YjhlOTZlOGU4ZWIyMGU4NTRkMTk4ZWM4MTEzODZhNmJmZmNlNDNmYTgxOGRjIn0%3D; citypy=eyJpdiI6ImJrZHhySjNTKzhONmpaeEpHK0RHRHc9PSIsInZhbHVlIjoiZmd2Q051VGpTZEhNTFBiM1FpbHRCZz09IiwibWFjIjoiYmUxNTc3NWMyYTg5ZTA3M2U3NWQ3OTIyZTkwNmE3MzJmNDQ3OTJhMzM1MjY4N2Q0NTc0NzEyMTExY2YzOGQwZSJ9; cityId=eyJpdiI6IkZaNjU4QnU4TXNibldnYWEwenpZSEE9PSIsInZhbHVlIjoidVZBVFhnVE1QamZrY1o0ZWNqTnVnUT09IiwibWFjIjoiYzk2NjRmNjZmZjY5MWVkZTY0OGVjOWM2ZGI4YzJkYjExMjJmMzE5Y2NhMmJjYmE3YmVkYzQ5Njc2MGRiZDdhNCJ9',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}

def get_source(url):
    """GET *url*, trying each proxy in proxies_lst in order until one
    connects; exit the process when every proxy fails.

    Returns the decoded page text (charset detected from the raw bytes).
    """
    try_times = 0
    while True:
        try:
            resp = requests.get(url, headers=head, timeout=30,
                                proxies=proxies_lst[try_times])
            break
        except (ConnectionError, requests.exceptions.Timeout):
            # rotate to the next proxy on connection failure or timeout
            try_times += 1
            if try_times == len(proxies_lst):
                print('所有代理服务器都无法使用')
                exit(1)  # BUGFIX: exit(0) signalled success on total failure
    resp.encoding = cchardet.detect(resp.content)['encoding']
    return resp.text

def get_data_secondhand_house_onepage(source, house_info_list):
    """Parse one listings page (*source*, raw HTML text) and append one
    13-tuple per listing to *house_info_list*.

    Tuple layout: (title, estate, address, "lng,lat", area, layout, storey,
    orientation, build year, total price, unit price, broker name,
    broker tel).  Unlike the non-proxy variant, every broker page is fetched
    (no phone-number caching).
    """
    root = html.fromstring(source)
    # one <dl> per listing
    house_resources_list = root.xpath('//div[@class="list_l"]/div/dl')
    for house_resource in house_resources_list:
        title = house_resource.xpath('dd[@class="house_msg"]/p[@class="name"]/a/text()')[0]
        estate = house_resource.xpath('dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"area")]//strong/text()')[0]
        address = house_resource.xpath('dd[@class="house_msg"]//span[@class="address"]/text()')[0].strip()
        # the map link (when present) embeds the coordinates in its query string
        longitude_latitude = house_resource.xpath('dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"area")]/a[@target="_blank"]/@href')
        if longitude_latitude:
            longitude_latitude = re.search(r'longitude=(.*?)&latitude=(.*)', longitude_latitude[0], re.S)
            longitude_latitude = '{},{}'.format(longitude_latitude.group(1), longitude_latitude.group(2))
        else:
            longitude_latitude = ''
        # the "type" <p> holds the area/layout/storey/orientation/year spans
        house_type = house_resource.xpath('dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"type")]')[0]
        area = house_type.xpath('span[1]/text()')[0]
        layout = re.sub('[\s\r\n]', '', house_type.xpath('span[3]/text()')[0])
        storey = house_type.xpath('span[5]/text()')[0]
        orientations = house_type.xpath('span[7]/text()')[0]
        # build year is optional on the page; default to ''
        construction_time = house_type.xpath('span[10]')
        if construction_time:
            construction_time = construction_time[0].xpath('text()')[0]
        else:
            construction_time = ''
        price = house_resource.xpath('dd[@class="house_price"]/p')[0].xpath('string(.)').strip()
        unit_price = house_resource.xpath('dd[@class="house_price"]/p[@class="junjia"]/text()')[0].strip()
        broker_a_tag = house_resource.xpath('dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"tag")]/a[@class="broker_name"]')[0]
        broker_url = 'https://fz.sofang.com' + broker_a_tag.xpath('@href')[0]
        broker_name = broker_a_tag.xpath('string(.)').strip()
        # fetch the broker page for the phone number (long sleep: proxies are slow)
        time.sleep(random.randint(5, 10))
        broker_root = html.fromstring(get_source(broker_url))
        broker_tel = broker_root.xpath('//div[@class="broker_tel"]')[0].xpath('string(.)').strip()
        print('标题:{}\n楼盘:{}\n地点:{}\n经纬度:{}\n面积:{}\n房型:{}\n楼层:{}\n朝向:{}\n建筑年份:{}\n总价:{}\n单价:{}\n经纪人:{}\n电话:{}\n'
              .format(title,estate,address,longitude_latitude,area,layout,storey,orientations,construction_time,price,unit_price,broker_name,broker_tel))
        house_info_list.append((title, estate, address, longitude_latitude, area, layout, storey, orientations,
                                construction_time, price, unit_price, broker_name, broker_tel))


def get_next_url(source):
    """Return the absolute URL of the "next page" link, or the empty
    xpath result when the current page is the last one."""
    tree = html.fromstring(source)
    hits = tree.xpath('//a[contains(text(),"下一页")]/@href')
    if not hits:
        return hits
    return 'https://fz.sofang.com' + hits[0]

def save_data(house_info_list):
    """Write all collected listings to soufang.xlsx: one header row plus
    one row per listing tuple (13 columns; no broker URL in this variant).

    Idiom fix: worksheet.write_row() with numeric (row, col) addressing
    replaces 13 hand-written write('A1'...) calls and the manual
    chr(ord('A')+j) cell-name arithmetic.
    """
    workbook = xlsxwriter.Workbook('soufang.xlsx')
    worksheet = workbook.add_worksheet()
    header = ['标题', '楼盘', '地点', '经纬度', '面积', '房型', '楼层', '朝向',
              '建筑年份', '总价', '单价', '经纪人', '电话']
    worksheet.write_row(0, 0, header)
    # data rows start at spreadsheet row 2, i.e. index 1
    for row_idx, house_info in enumerate(house_info_list, start=1):
        worksheet.write_row(row_idx, 0, house_info)
    workbook.close()


if __name__ == '__main__':
    house_info_list = []
    i = 1
    url = 'https://fz.sofang.com/esfsale/area'
    while True:
        # Fetch one Fuzhou second-hand-listings page through the proxy pool.
        source = get_source(url)
        # Extract the listings from the page source.
        get_data_secondhand_house_onepage(source, house_info_list)
        time.sleep(random.randint(10, 15))
        print(f'已提取第{i}页的数据')
        url = get_next_url(source)
        if not url:
            break
        i += 1

    # Dump the collected rows (note: prints only the first 11 fields).
    for house_info in house_info_list:
        print('标题:{}\n楼盘:{}\n地点:{}\n经纬度:{}\n面积:{}\n房型:{}\n楼层:{}\n朝向:{}\n建筑年份:{}\n总价:{}\n单价:{}'
              .format(house_info[0],house_info[1],house_info[2],house_info[3],house_info[4],
                      house_info[5],house_info[6],house_info[7],house_info[8],house_info[9],house_info[10]))
    
    save_data(house_info_list)
posted @ 2024-04-05 21:31  踩坑大王  阅读(105)  评论(0)    收藏  举报