# Spider file
# -*- coding: utf-8 -*-
import re

import scrapy

from soufangwang.items import NewHouseItem, SecondhandHouseItem


class FangspiderSpider(scrapy.Spider):
    name = 'fangSpider'
    allowed_domains = ['fang.com']
    start_urls = ['https://www.fang.com/SoufunFamily.html']

    def parse(self, response):
        trs = response.xpath("//div[@class='outCont']//tr")
        province = None
        for tr in trs:
            # Take only the <td> tags without a class attribute; the ones
            # with class='font01' are row headers, not data cells.
            tds = tr.xpath(".//td[not(@class)]")
            province_td = tds[0]
            province_text = province_td.xpath('.//text()').get()
            # Strip all whitespace characters.
            province_text = re.sub(r'\s', '', province_text)
            if province_text:
                province = province_text
            # Don't crawl overseas cities (the '其它' row).
            if province == '其它':
                continue
            city_td = tds[1]
            city_links = city_td.xpath('.//a')
            for city_link in city_links:
                city = city_link.xpath('.//text()').get()
                city_url = city_link.xpath('.//@href').get()
                # Derive the new-house and second-hand URLs from the city URL.
                new_url = city_url.split(".")
                if new_url[0].endswith('bj'):
                    # Beijing is a special case with fixed subdomains.
                    newhouse_url = "https://newhouse.fang.com/house/s/"
                    secondhand_url = "https://esf.fang.com/"
                else:
                    newhouse_url = new_url[0] + ".newhouse." + new_url[1] + "." + new_url[2] + "house/s/"
                    # Second-hand listings live on the esf subdomain.
                    secondhand_url = new_url[0] + ".esf." + new_url[1] + "." + new_url[2]
                print(city, 'new-house link:', newhouse_url)
                print(city, 'second-hand link:', secondhand_url)
                yield scrapy.Request(url=newhouse_url, callback=self.parse_newhouse,
                                     meta={'info': (province, city, newhouse_url)})
                yield scrapy.Request(url=secondhand_url, callback=self.parse_esf,
                                     dont_filter=True,
                                     meta={'info': (province, city, secondhand_url)})
    # Parse the new-house listing pages.
    def parse_newhouse(self, response):
        # Unpack the data forwarded through the request meta.
        province, city, newhouse_url = response.meta.get('info')
        li_list = response.xpath('//div[contains(@class,"nl_con")]/ul/li')
        for li in li_list:
            try:
                name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get().strip()
                house_type_list = li.xpath('.//div[contains(@class,"house_type")]//text()').getall()
                house_type_list = [x.replace("\n", "").replace("\t", "") for x in house_type_list]
                house_type_str = ''.join(house_type_list).strip().split("-")
                house_type = house_type_str[0]
                area = house_type_str[1]
                address = li.xpath('.//div[@class="address"]/a/text()').getall()
                address = [x.replace("\n", "").replace("\t", "") for x in address][1]
                district = li.xpath('.//div[@class="address"]/a/span/text()').get().strip()
                # Drop the surrounding brackets, e.g. "[朝阳]" -> "朝阳".
                district = district[1:-1]
                sale = li.xpath('.//div[@class="fangyuan"]/span[1]/text()').get()
                price = li.xpath('.//div[@class="nhouse_price"]//text()').getall()
                price = "".join(x.replace("\n", "").replace("\t", "") for x in price)
                origin_url = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()
                origin_url = "https:" + origin_url
                item = NewHouseItem(province=province, city=city, name=name, price=price,
                                    rooms=house_type, area=area, address=address,
                                    district=district, sale=sale, origin_url=origin_url)
                yield item
            except (AttributeError, IndexError):
                # Skip entries (usually ads) that don't match the expected layout.
                continue
        # Follow pagination: rebuild an absolute URL from the page-number segment.
        next_url = response.xpath('//div[@class="page"]//a[@class="next"]/@href').get()
        if next_url:
            next_url = newhouse_url + next_url.split("/")[-2]
            yield scrapy.Request(url=next_url, callback=self.parse_newhouse,
                                 meta={'info': (province, city, newhouse_url)})
    # Parse the second-hand house listing pages.
    def parse_esf(self, response):
        # Unpack the data forwarded through the request meta.
        province, city, secondhand_url = response.meta.get('info')
        dls = response.xpath('//div[@class="main945 floatl"]/div[@class="shop_list shop_list_4"]/dl[@dataflag="bg"]')
        for dl in dls:
            try:
                name = dl.xpath('.//h4[@class="clearfix"]//span[@class="tit_shop"]/text()').get()
                price = dl.xpath('.//dd[@class="price_right"]/span//text()').getall()
                unit = price[-1]
                price = [x.replace("\n", "").replace("\t", "") for x in price][:-1]
                price = "".join(price)
                room_info = dl.xpath('.//p[@class="tel_shop"]/text()').getall()
                room_info = [x.replace("\n", "").replace("\r", "").strip() for x in room_info][:5]
                # Detached villas ("独栋") list the fields in a different order
                # and omit the year, so default it to None.
                year = None
                if room_info[0] == "独栋":
                    floor = room_info[0]
                    rooms = room_info[1]
                    area = room_info[3]
                    toward = room_info[4]
                else:
                    rooms = room_info[0]
                    area = room_info[1]
                    floor = room_info[2]
                    toward = room_info[3]
                    year = room_info[4]
                address = dl.xpath('.//p[@class="add_shop"]/span/text()').get()
                origin_url = dl.xpath('.//h4[@class="clearfix"]/a/@href').get()
                origin_url = secondhand_url[:-1] + origin_url
                item = SecondhandHouseItem(province=province, city=city, name=name, price=price,
                                           unit=unit, rooms=rooms, area=area, floor=floor,
                                           toward=toward, year=year, address=address,
                                           origin_url=origin_url)
                yield item
            except (AttributeError, IndexError):
                # Skip entries (usually ads) that don't match the expected layout.
                continue
        # Follow pagination.
        next_url = response.xpath('//div[@class="page_al"]/p/a/@href').get()
        if next_url:
            next_url = secondhand_url[:-1] + next_url
            yield scrapy.Request(url=next_url, callback=self.parse_esf,
                                 meta={'info': (province, city, secondhand_url)})
# Pipeline file
from scrapy.exporters import JsonLinesItemExporter

from soufangwang.items import NewHouseItem, SecondhandHouseItem


class SoufangwangPipeline(object):
    def __init__(self):
        # Open both output files in binary mode, as the exporters expect.
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(
            self.newhouse_fp, ensure_ascii=False)
        self.esfhouse_exporter = JsonLinesItemExporter(
            self.esfhouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        # Route each item to the matching exporter; unconditionally exporting
        # every item to both files would write duplicates.
        if isinstance(item, NewHouseItem):
            self.newhouse_exporter.export_item(item)
        elif isinstance(item, SecondhandHouseItem):
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
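# Usage sketch: JsonLinesItemExporter writes one JSON object per line, so the
# files produced by the pipeline can be read back line by line. The field
# names match NewHouseItem and the file name matches the pipeline above.
import json

with open('newhouse.json', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)
        print(record['city'], record['name'], record['price'])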
# Items file
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class NewHouseItem(scrapy.Item):
    # Province
    province = scrapy.Field()
    # City
    city = scrapy.Field()
    # Development (complex) name
    name = scrapy.Field()
    # Price
    price = scrapy.Field()
    # Layout (number of rooms)
    rooms = scrapy.Field()
    # Floor area
    area = scrapy.Field()
    # Address
    address = scrapy.Field()
    # Administrative district
    district = scrapy.Field()
    # Sale status (on sale or not)
    sale = scrapy.Field()
    # Fang.com detail-page URL
    origin_url = scrapy.Field()


class SecondhandHouseItem(scrapy.Item):
    # Province
    province = scrapy.Field()
    # City
    city = scrapy.Field()
    # Complex name
    name = scrapy.Field()
    # Total price
    price = scrapy.Field()
    # Unit price
    unit = scrapy.Field()
    # Layout (number of rooms)
    rooms = scrapy.Field()
    # Floor
    floor = scrapy.Field()
    # Orientation
    toward = scrapy.Field()
    # Year built
    year = scrapy.Field()
    # Floor area
    area = scrapy.Field()
    # Address
    address = scrapy.Field()
    # Fang.com detail-page URL
    origin_url = scrapy.Field()
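# Quick sketch of how scrapy.Item behaves: the Field declarations define the
# allowed keys, and items are accessed like dicts. The values below are made
# up for illustration; assigning an undeclared key raises KeyError.
item = NewHouseItem(province='四川', city='成都', name='示例小区')
item['price'] = '12000元/平方米'
print(item['city'])   # 成都
# item['foo'] = 1     # KeyError: NewHouseItem does not support field: foo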
# Middleware: set a random User-Agent header on every request
# -*- coding: utf-8 -*-
import random


class UserAgentDownloadMiddleware(object):
    USER_AGENTS = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
"(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]

    def process_request(self, request, spider):
        # Rotate the User-Agent so requests don't all share one fingerprint.
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
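# settings.py sketch: neither the pipeline nor the middleware runs until it is
# registered. The module paths assume the Scrapy project is named soufangwang
# (matching the imports above); the delay value is an arbitrary example.
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1

DOWNLOADER_MIDDLEWARES = {
    'soufangwang.middlewares.UserAgentDownloadMiddleware': 543,
}

ITEM_PIPELINES = {
    'soufangwang.pipelines.SoufangwangPipeline': 300,
}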