链家网深圳各区租房信息爬取(多线程爬取)

代码:

import requests
from pyquery import PyQuery as pq
import csv
from threading import Thread,Lock
import time
import os

def get_one_url(url):
    """Download one listing page and return its HTML.

    :param url: absolute URL of the page to fetch.
    :return: the response body as text when the server answers 200,
        otherwise ``None`` (non-200 status or a request-level failure).
    """
    # Header names/values must not contain stray spaces, otherwise they are
    # not sent as a real Referer / User-Agent.
    headers = {
        'Referer': 'https://s1.ljcdn.com/matrix_pc/dist/pc/src/common/css/common.css?_v=20191213202326259',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    }
    try:
        # headers must be passed by keyword: the second positional argument
        # of requests.get is ``params`` (the query string), not the headers.
        # The timeout keeps a stalled connection from hanging a worker thread.
        response = requests.get(url, headers=headers, proxies=None, timeout=10)
    except requests.exceptions.RequestException as e:
        # RequestException also covers timeouts, not only connection errors.
        print('error', e.args)
        return None
    # Only a 200 response carries a usable page.
    if response.status_code == 200:
        return response.text
    return None

def parse_one_url(html):
    """Yield one dict per rental listing found in a listing page.

    :param html: HTML text of a listing page; ``None`` or an empty string
        yields nothing.
    :return: a generator of dicts with the keys ``content``, ``room``,
        ``direction``, ``size``, ``price`` and ``location``.
    """
    if not html:
        # Nothing was downloaded, so there is nothing to parse.
        return
    doc = pq(html)
    for item in doc('.content__list--item--main').items():
        # ``with`` guarantees the lock is released even if extraction fails;
        # a leaked lock would block every other worker thread forever.
        with mutes:
            try:
                title_parts = item.find('a').text().split()
                info = {
                    'content': title_parts[0],
                    'room': title_parts[1],
                    'direction': title_parts[2],
                    'size': item.find('p.content__list--item--des').text().split('/')[1],
                    'price': item.find('.content__list--item-price').text(),
                    'location': item.find('.content__list--item--des a').text().replace(' ', '-'),
                }
            except IndexError:
                # The listing lacks one of the expected fields; skip it
                # instead of aborting the whole page.
                continue
        print(info)
        # Generator: records are handed to the caller one at a time.
        yield info

# Create the module-level mutex; parse_one_url acquires it while it builds
# each listing record and releases it afterwards.
mutes = Lock()

def man(page,city):
    """Crawl one result page of one district and save its listings.

    :param page: page number substituted into the listing URL.
    :param city: district slug used in the URL (e.g. ``'luohuqu'``); it is
        also forwarded to ``save``.
    :return: None
    """
    url = 'https://sz.lianjia.com/zufang/%s/pg%d/#contentList'%(city,page)
    html = get_one_url(url)
    if html is None:
        # get_one_url returns None when the page could not be fetched;
        # there is nothing to parse or save for this page.
        return
    infos = parse_one_url(html)
    save(infos, city)


# Serialises CSV writes: save() is called from many worker threads at once.
_SAVE_LOCK = Lock()


def save(infos,city):
    """Append listing records to ``<city>.csv``.

    The header row is written only when the file does not exist yet or is
    still empty, so repeated calls for the same city produce one header
    followed by all rows.

    :param infos: iterable of dicts with the keys ``content``, ``room``,
        ``direction``, ``size``, ``location`` and ``price``.
    :param city: district slug; used as the CSV file name (without extension).
    :return: None
    """
    path = '%s.csv' % (city)
    fieldname = ['content', 'room', 'direction', 'size', 'location', 'price']
    # Consume the (possibly lazy) iterable before taking the lock so that
    # parsing does not run while other threads wait to write.
    rows = list(infos)
    with _SAVE_LOCK:
        # Decide about the header under the lock; otherwise two threads can
        # both see a missing file and both write a header.
        need_header = not os.path.exists(path) or os.path.getsize(path) == 0
        # newline='' is what the csv module expects; without it Windows
        # output gets an extra blank line after every row.
        with open(path, 'a', encoding='utf-8', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldname)  # DictWriter writes dict rows
            if need_header:
                writer.writeheader()
            writer.writerows(rows)


if __name__=='__main__':
    # Lianjia URL slugs of the Shenzhen districts to crawl.
    cities = ['luohuqu', 'longhuaqu', 'futianqu']
    for city in cities:
        # One thread per result page (pages 1..49) so the pages download
        # concurrently. Plain Thread objects in a list replace the former
        # exec()-generated variables.
        ts = [Thread(target=man, args=(page, city)) for page in range(1, 50)]
        for t in ts:
            t.start()
        # join() really waits for every worker; a fixed sleep could report
        # success while pages were still being fetched or written.
        for t in ts:
            t.join()
        print('%s租房信息存入成功' %(city))
    print('全部存入成功')

  改进:可以动态爬取,用户输入想要爬取的深圳区域,程序自动爬取并保存;可以多进程爬取,同时爬取不同区域的住房信息并分别保存不同的csv文件;如果怕被封ip,还可以连接代理的 api,用代理爬取。

posted @ 2019-12-20 20:11  季末并不寂寞  阅读(225)  评论(0)    收藏  举报