东篱野鹤

导航

初学爬虫之爬取某房产楼盘信息

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/12/24 17:01
# @Site    : 
# @File    : shell.py
# @Software: PyCharm

import os
import time
import json
import xlwt
import urllib3
import logging
import requests
from  pyquery import PyQuery

# Ignore urllib3's InsecureRequestWarning (added for capturing traffic with
# Fiddler); the requests in this file are sent with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# HTTP headers sent with every request made by this script.
headers = {
    "Referer": "https://ag.fang.ke.com/loupan",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/78.0.3904.108 Safari/537.36"
    ),
}

# Console log output.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger("shell_room")
logger_detail = logging.getLogger("shell_room_detail")

def shell_room_page(pgmax, timeout=30):
    """Collect the project codes shown on the first ``pgmax`` listing pages.

    For every page number N in 1..pgmax the page
    ``https://cq.fang.ke.com/loupan/pg<N>`` is downloaded and the
    ``data-project-name`` attribute of each ``li`` below
    ``.resblock-list-wrapper`` is collected; ``li`` elements without that
    attribute are ignored.  The collected codes are also printed.

    Args:
        pgmax: Number of listing pages to fetch.  Values below 1 fetch nothing.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection raises instead of blocking forever.

    Returns:
        A list with the collected ``data-project-name`` values, in page order.

    Raises:
        Whatever ``requests.get`` raises for a failed or timed-out request;
        request errors are not handled here.
    """
    room_page_list = []
    for page_no in range(1, pgmax + 1):
        url = 'https://cq.fang.ke.com/loupan/pg' + str(page_no)
        logger.info("正在获取的链接:%s", url)
        # Certificate verification is switched off for these requests.
        response = requests.get(url, headers=headers, verify=False, timeout=timeout)
        logger.info("正在获取%s页房源......", page_no)
        page_doc = PyQuery(response.text)

        # Codes found on the current page only, used for the per-page count.
        page_codes = []
        for item in page_doc('.resblock-list-wrapper li ').items():
            code = item.attr('data-project-name')
            if code is not None:
                page_codes.append(code)
        room_page_list.extend(page_codes)

        logger.debug(
            "当前是第%s页,本页有%s套资源,当前共获取%s套资源!",
            page_no, len(page_codes), len(room_page_list),
        )
    print("房源信息代码:")
    print(room_page_list)
    return room_page_list

# Keys of one detail record, in the column order used for the exported sheet.
_DETAIL_COLUMNS = (
    "pro_name",
    "room_code",
    "ref_ave_price",
    "ref_total_price",
    "ref_unit_price",
    "new_open_date",
    "pro_addr",
    "pro_tag",
    "style_room",
    "sell_house_type",
)


def _parse_prices(price_list):
    """Turn the ``.price span`` texts into (average, total, unit) price strings.

    Raises IndexError when ``price_list`` is shorter than the positions read.
    """
    if price_list[0] == u'价格待定':  # not on sale yet and no reference price
        pending = u'未开盘,价格待定'
        return pending, pending, pending
    ref_ave_price = price_list[1] + price_list[2]
    if price_list[3] == u'参考单价':  # no total price is listed
        return ref_ave_price, u'暂无总价', price_list[4] + price_list[5]
    # All three quotes are present.
    return ref_ave_price, price_list[3] + price_list[4], price_list[6] + price_list[7]


def _first_content_text(spans, default=u'未知'):
    """Return the text of the first span whose class is exactly "content".

    ``default`` is returned when no span matches, including when ``spans`` is
    empty, so the caller never ends up with an unset or stale value.
    """
    for span in spans.items():
        if span.attr('class') == "content":
            return span.text()
    return default


def _joined_texts(elements, required_class=None):
    """Concatenate element texts, each followed by '/' (trailing '/' is kept).

    When ``required_class`` is given, only elements whose class attribute
    equals it are included.
    """
    return ''.join([
        element.text() + '/'
        for element in elements.items()
        if required_class is None or element.attr('class') == required_class
    ])


def _build_detail(detail_doc, room_code):
    """Extract one detail record (a dict keyed by _DETAIL_COLUMNS) from a parsed page."""
    price_list = [span.text() for span in detail_doc('.price span').items()]
    ref_ave_price, ref_total_price, ref_unit_price = _parse_prices(price_list)

    # Sale status and property type: the first two tags, when both are present.
    type_list = [span.text() for span in detail_doc('.tags-wrap span').items()]
    if len(type_list) >= 2:
        sell_house_type = type_list[0] + '/' + type_list[1]
    else:
        sell_house_type = None

    return {
        "pro_name": detail_doc('.title-wrap div h2').text(),
        "room_code": room_code,
        "ref_ave_price": ref_ave_price,
        "ref_total_price": ref_total_price,
        "ref_unit_price": ref_unit_price,
        # Latest opening date.
        "new_open_date": _first_content_text(detail_doc('.open-date span')),
        # Project address.
        "pro_addr": _first_content_text(detail_doc('.info-item span')),
        # Project tags.
        "pro_tag": _joined_texts(detail_doc(".top-info ul li"), "item"),
        # Layouts.
        "style_room": _joined_texts(detail_doc('.content span')),
        "sell_house_type": sell_house_type,
    }


def shell_room_detail(list, timeout=30):
    """Download the detail page of every project code and export the data to .xls.

    For each code the page ``https://cq.fang.ke.com/loupan/p_<code>`` is
    fetched and parsed, the record is appended as a row to an xlwt sheet
    named ``Shell_room`` and printed as JSON.  The workbook is saved to
    ``<parent of cwd>/result_data/Shell_room_<timestamp>.xls`` after every
    row; the directory is created when missing.  A code whose download,
    parsing or saving fails is logged with its traceback and skipped.

    Args:
        list: Sequence of project codes (the ``data-project-name`` values).
            The name hides the builtin but is kept for existing callers.
            An empty or ``None`` value does nothing.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        None.
    """
    room_codes = list  # local alias; the parameter name hides the builtin
    if not room_codes:
        return

    now = time.strftime("%Y_%m_%d %H_%M_%S")
    # os.path.join instead of hard-coded '\\' so the path is valid on any OS.
    out_dir = os.path.join(os.path.dirname(os.path.abspath('.')), 'result_data')
    out_path = os.path.join(out_dir, 'Shell_room_' + now + '.xls')

    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet('Shell_room')
    for column, key in enumerate(_DETAIL_COLUMNS):
        worksheet.write(0, column, key)

    rows_written = 0
    for index, room_code in enumerate(room_codes, 1):
        # Best-effort per item: any failure is logged and the next code is tried.
        try:
            url = 'https://cq.fang.ke.com/loupan/p_' + room_code
            logger.info("第%s套资源链接:%s", index, url)
            response = requests.get(url, headers=headers, verify=False, timeout=timeout)
            detail_dic = _build_detail(PyQuery(response.text), room_code)
            logger.info("正在获取第%s套信息......", index)

            rows_written += 1
            for column, key in enumerate(_DETAIL_COLUMNS):
                worksheet.write(rows_written, column, detail_dic[key])

            # Save after every row so rows already collected are on disk
            # even if a later item fails or the run is interrupted.
            if not os.path.isdir(out_dir):
                os.makedirs(out_dir)
            workbook.save(out_path)
            logger_detail.info("第%s条信息保存成功,success!!!", rows_written)

            print("第%s套信息:" % index)
            # No ``encoding`` argument: Python 3's json.dumps rejects it and
            # Python 2 already defaults to UTF-8.
            print(json.dumps(detail_dic, ensure_ascii=False))
        except Exception:
            logger.exception("第%s套资源处理失败,已跳过:%s", index, room_code)

if __name__ == '__main__':
    # Scrape the first listing page, then export the details of every project
    # found on it.  The result is not stored under the name ``list``: a
    # module-level ``list`` would hide the builtin for every function here.
    room_codes = shell_room_page(1)
    shell_room_detail(room_codes)

posted on 2019-12-28 17:21  东篱野鹤  阅读(670)  评论(0)    收藏  举报