麦田厦门小区信息数据爬取

刚开始爬取的时候没有用 headers 伪装成浏览器,导致麦田北京站和福州站把我的 IP 封禁了。还好后来发现了原因,而且厦门小区页面还没有被封,代码如下:

#-*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
# Listing page for all Xiamen communities on maitian.cn; detail-page URLs
# are derived from this base by replacing the "/xqall" path segment.
page_url = "http://xm.maitian.cn/xqall"
# Browser-like headers so the site does not block the crawler's IP
# (requests' default User-Agent got the author banned on other city sites).
headers = {"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
           "Referer":"http://xm.maitian.cn/esfall",
           "Connection":"keep-alive",
           "Content-Type":"text/plain; charset=utf-8"}


def get_communities_url():
    """Fetch the community listing page and scrape details for every community.

    Walks each ``<li>`` inside the ``list_wrap`` div, builds the community's
    detail URL from its ``<h1><a href>`` link, and delegates the per-page
    scraping to :func:`get_target_info`.

    Returns:
        list[dict]: one info dict per community (see ``get_target_info``).

    Raises:
        Exception: re-raised if the HTTP request for the listing page fails.
    """
    all_data = []
    try:
        # Timeout so a hung connection cannot stall the whole crawl.
        response = requests.get(url=page_url, headers=headers, timeout=10)
    except Exception:
        print("请求连接错误")
        raise

    soup = BeautifulSoup(response.text, "lxml")
    list_wrap = soup.find("div", "list_wrap")
    for tag_li in list_wrap.find_all("li"):
        href = tag_li.h1.a['href']
        # Swap the listing path for the community's own path segment.
        new_url = page_url.replace("/xqall", href)
        dict_data = get_target_info(new_url)
        if dict_data:
            all_data.append(dict_data)
    return all_data

def get_target_info(new_url):
    """Scrape one community detail page and collect its key attributes.

    Args:
        new_url: absolute URL of a community detail page.

    Returns:
        dict: scraped fields — average price, for-sale/for-rent counts,
        business circle, developer, and the nine labelled stats from the
        ``home_details`` list (area, fees, year built, plot ratio, ...).

    Raises:
        Exception: re-raised if the HTTP request fails.
    """
    info = {}  # renamed from `dict`, which shadowed the builtin
    try:
        response = requests.get(url=new_url, headers=headers, timeout=10)
    except Exception:
        print("请求连接错误")
        raise

    soup = BeautifulSoup(response.text, 'lxml')
    home_main = soup.find("section", "home_main")
    ps = home_main.find_all("p")
    # Average listing price of the community.
    info["community_avg"] = ps[0].b.string.strip()
    ems = ps[1].find_all("em")
    # Homes currently for sale / for rent; str() replaces the old `+ ""` cast.
    info["unsold_homes"] = str(ems[0].a.string)
    info["rent_homes"] = str(ems[1].a.string)
    # Business circle the community belongs to.
    info["business_circle"] = ps[2].label.string
    # Developer of the community.
    info["developers"] = ps[2].em.string

    # Each <li> column of home_details holds three labelled <p> elements;
    # map the li's exact class list to the dict keys, in document order.
    column_fields = {
        # building area, property-management company, property fee
        ("li_left",): ("area", "property_company", "industry_fee"),
        # year built, total number of houses, greening rate
        ("li_center",): ("built_year", "total_houses", "green_rates"),
        # site area, total number of buildings, plot ratio
        ("li_right",): ("cover_area", "total_built", "product_rates"),
    }
    home_details = soup.find("ul", "home_details")
    for tag_li in home_details.find_all("li"):
        fields = column_fields.get(tuple(tag_li["class"]))
        if fields is None:
            continue
        for key, p in zip(fields, tag_li.find_all("p")):
            info[key] = p.em.string
    return info



if __name__ == '__main__':
    # Crawl every community and dump the collected records to stdout.
    print(get_communities_url())

 

posted @ 2019-08-26 22:28  智、心  阅读(300)  评论(0编辑  收藏  举报