xpath 获取深圳房源信息并导出csv

# -*- coding: utf-8 -*-
# @Time    : 2019/4/28 10:44
# @Author  : wujf
# @Email   : 1028540310@qq.com
# @File    : 采集房屋信息.py
# @Software: PyCharm

'''
https://sz.centanet.com/xiaoqu/g1/
https://sz.centanet.com/xiaoqu/g220/
'''

import requests
from lxml import etree
import pandas as pd

# Listing pages run from .../xiaoqu/g1/ through .../xiaoqu/g220/.
PAGE_URL_TEMPLATE = "https://sz.centanet.com/xiaoqu/g{}/"
FIRST_PAGE = 1
LAST_PAGE = 220

# Browser-like User-Agent sent with every request (a request header, not a proxy).
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
REQUEST_TIMEOUT = 30

TITLE_XPATH = '//div[@class="house-item clearfix"]/div[@class="item-info fl"]/h4[@class="house-title"]/a/text()'
ADDRESS_XPATH = '//div[@class="house-item clearfix"]/div[@class="item-info fl"]/p[@class="mid f14 f000"]/text()'
PRICE_XPATH = '//div[@class="item-pricearea fr"]/p[@class="price-nub cRed tc"]/span/text()'

# Stripped text fragments that carry no address information.
ADDRESS_NOISE = ('', '-')

OUTPUT_CSV = '房源信息表详细版.csv'


def clean_address(fragments):
    """Return the address strings found among raw text fragments.

    Each fragment is stripped of surrounding whitespace (including the
    ``\\r\\n`` padding around the text nodes); fragments that are then empty
    or a lone ``'-'`` are discarded.  Order is preserved and repeated
    addresses are kept, so two listings that share an address still
    contribute two entries and stay aligned with the title/price columns.

    Args:
        fragments: iterable of strings, e.g. the result of a ``text()`` XPath.

    Returns:
        A list of cleaned, non-noise strings (possibly empty).
    """
    stripped = (fragment.strip() for fragment in fragments)
    return [text for text in stripped if text not in ADDRESS_NOISE]


def fetch_page(url):
    """Download *url* and return the response body decoded to ``str``."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT, headers=REQUEST_HEADERS)
    return response.content.decode()


def parse_page(html):
    """Extract ``(titles, addresses, prices)`` lists from one listing page.

    Args:
        html: the page markup as a string.

    Returns:
        A 3-tuple of lists of strings.  All three are empty when the parser
        yields no document.
    """
    tree = etree.HTML(html)
    if tree is None:
        return [], [], []
    titles = tree.xpath(TITLE_XPATH)
    addresses = clean_address(tree.xpath(ADDRESS_XPATH))
    prices = tree.xpath(PRICE_XPATH)
    return titles, addresses, prices


def scrape_pages():
    """Yield ``(url, titles, addresses, prices)`` for every listing page."""
    for page_number in range(FIRST_PAGE, LAST_PAGE + 1):
        url = PAGE_URL_TEMPLATE.format(page_number)
        titles, addresses, prices = parse_page(fetch_page(url))
        print(addresses)
        yield url, titles, addresses, prices


def build_table(pages):
    """Combine per-page results into a single DataFrame.

    Args:
        pages: iterable of ``(url, titles, addresses, prices)`` tuples.

    Returns:
        A ``pandas.DataFrame`` with the columns ``name``, ``address`` and
        ``price``.

    A page whose three lists differ in length cannot be paired up row by row,
    so it is reported and skipped rather than letting one odd page make the
    final DataFrame construction fail after the whole crawl.
    """
    names, addresses, prices = [], [], []
    for url, page_titles, page_addresses, page_prices in pages:
        if not len(page_titles) == len(page_addresses) == len(page_prices):
            print('skipping {}: {} titles, {} addresses, {} prices'.format(
                url, len(page_titles), len(page_addresses), len(page_prices)))
            continue
        # extend() keeps the flattening linear; sum(lists, []) is quadratic.
        names.extend(page_titles)
        addresses.extend(page_addresses)
        prices.extend(page_prices)
    return pd.DataFrame({'name': names, 'address': addresses, 'price': prices})


def main():
    """Scrape every listing page and export the combined table to CSV."""
    house = build_table(scrape_pages())
    # Show a preview of the table; a bare house.head() prints nothing in a script.
    print(house.head())
    # NOTE: gbk output raises UnicodeEncodeError for characters outside GBK.
    house.to_csv(OUTPUT_CSV, encoding='gbk')


if __name__ == '__main__':
    main()
# house.to_sql()

  

posted @ 2019-04-28 17:11  龙卷风之殇  阅读(307)  评论(0)    收藏  举报