首次使用 Scrapy 编写的 58 同城租房爬虫
house_test
# -*- coding: utf-8 -*-
import scrapy
from ..items import HouseItem
from pyquery import PyQuery
class HouseTestSpider(scrapy.Spider):
    """Spider that extracts rental listings from the 58.com Hangzhou page.

    Each ``<li>`` matched by the listing selector produces one ``HouseItem``
    carrying the listing's title text, link and price text.
    """

    name = 'house_test'
    allowed_domains = ['58.com']
    # Page the crawl starts from.
    start_urls = ['https://hz.58.com/chuzu/']

    def parse(self, response):
        """Yield a ``HouseItem`` for every listing row in *response*.

        Args:
            response: The downloaded listing page.

        Yields:
            HouseItem: ``name`` and ``price`` hold the text of the matched
            elements; ``url`` is the anchor's ``href`` resolved against the
            page URL, or ``None`` when the row has no matching anchor.
        """
        doc = PyQuery(response.text)
        # Select every listing row on the page.
        rows = doc('body > div.list-wrap > div.list-box > ul > li').items()
        for row in rows:
            a_tag = row('div.des > h2 > a')
            href = a_tag.attr('href')

            item = HouseItem()
            item['name'] = a_tag.text()
            # An href is not guaranteed to be absolute; resolve it against the
            # page URL so the stored link is usable on its own. A missing
            # href is passed through unchanged instead of becoming the page
            # URL.
            item['url'] = response.urljoin(href) if href else href
            item['price'] = row('div.list-li-right > div.money > b').text()
            yield item
pipelines
import json
import pandas as pd
class HousePipeline(object):
    """Item pipeline that appends every item to ``58.txt`` as one JSON line.

    The file is opened in ``open_spider`` and closed in ``close_spider``, so
    the handle is kept on ``self.file`` between the two calls.
    """

    def open_spider(self, spider):
        """Open (and truncate) the output file.

        Args:
            spider: Unused by this method.
        """
        # Kept open across calls on purpose; close_spider() releases it.
        self.file = open('58.txt', 'w', encoding='utf-8')
        print("打开文件")

    def process_item(self, item, spider):
        """Serialize *item* to JSON, write it as one line, and return it.

        Args:
            item: Any object accepted by ``dict()``.
            spider: Unused by this method.

        Returns:
            The same *item* object, unchanged.
        """
        # ensure_ascii=False writes non-ASCII text (e.g. Chinese) verbatim
        # instead of as \uXXXX escapes.
        line = '{}\n'.format(json.dumps(dict(item), ensure_ascii=False))
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file.

        Args:
            spider: Unused by this method.
        """
        self.file.close()
        print("关闭文件")
main
from scrapy.cmdline import execute
# Start the crawl in-process with the same arguments as the shell command
# `scrapy crawl house_test`.
execute(['scrapy', 'crawl', 'house_test'])
items
import scrapy
# Declare the item's field names here; the spider fills them in.
class HouseItem(scrapy.Item):
    """Container for one scraped listing."""

    # Fields assigned by the spider for each listing row.
    name = scrapy.Field()
    price = scrapy.Field()
    url = scrapy.Field()