数据分析实战(8)-贝壳租房XPath爬虫+数据分析实战

sadsadsadsa 

import sys

import requests
from lxml import etree

# Fetch the rental listing page at `url`, then print the fields of the first
# listing whose every indexed field can be extracted.

basic_url = "https://xa.zu.ke.com"
url = "https://xa.zu.ke.com/zufang/"
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}

# A timeout keeps the script from blocking forever on an unresponsive server;
# raise_for_status() surfaces HTTP errors instead of silently parsing an
# error page that contains no listings.
response = requests.get(url=url, headers=header, timeout=10)
response.raise_for_status()
html = response.text
tree = etree.HTML(html)

# Each child <div> of the "content__list" container is treated as one listing.
div_list = tree.xpath('//div[@class="content__list"]/div')
for div in div_list:
    try:
        # Fields are printed as soon as they are extracted, so a listing that
        # fails part-way still shows the fields found before the failure.
        name = div.xpath('.//p[1]/a/text()')[0]
        print(name)

        # The href is joined onto basic_url to form the detail-page URL.
        target_url = basic_url + div.xpath('.//p[1]/a/@href')[0]
        print(target_url)

        area = div.xpath('.//p[2]/a[1]/text()')[0]
        print(area)

        subdivide = div.xpath('.//p[2]/a[2]/text()')[0]
        print(subdivide)

        # NOTE: a separate community_name field (same XPath as `subdivide`)
        # was disabled by the original author as unreliable for some listings.

        # Text nodes of the second <p>, evaluated once and indexed below.
        # The positions 4/5/6 are taken as-is from the original script;
        # presumably they match the page layout at the time -- TODO confirm.
        p2_texts = div.xpath('.//p[2]/text()')

        space_size = p2_texts[4]
        print(space_size)

        towards = p2_texts[5]
        print(towards)

        room_type = p2_texts[6]
        print(room_type)

        # NOTE: an apartment_name field ('.//p[2]/p/text()') was disabled by
        # the original author as unreliable for some listings.

        floor = div.xpath('.//p[2]/span/text()')[1]
        print(floor)

        last_updated = div.xpath('.//p[3]/text()')[0]
        print(last_updated)

        is_new = div.xpath('.//p[4]/i[1]/text()')[0]
        print(is_new)

        # Not indexed: `decoration` is printed as a (possibly empty) list, so
        # a missing tag here never raises IndexError.
        decoration = div.xpath('div[1]/p[4]/i[4]/text()')
        print(decoration)

        price = div.xpath('.//span/em/text()')[0]
        print(price)

        data_unit = div.xpath('./div[1]/span/text()')[0]
        print(data_unit)

        # Stop after the first listing that parsed completely.
        break
    except IndexError as exc:
        # A required node was missing; report on stderr (stdout output is
        # unchanged) and move on to the next listing.
        print("skipping listing with a missing field:", exc, file=sys.stderr)
posted @ 2019-12-06 16:14  麦小秋  阅读(534)  评论(0)  编辑  收藏  举报