广州楼盘抓取分析-分析问题
上文其实还是有不少问题的。
1. 顺序执行,抓取效率比较慢;2. 不支持断点续抓,程序一旦中断就只能从头开始。
那么,解决办法是什么呢?
对于问题1,可以采用生产者消费者模式来改写,代码如下
# -*- coding: utf-8 -*-
#######################################################################
# Copyright (C) 2005-2016 UC Mobile Limited. All Rights Reserved
# File : first_sale_spider.py
#
# Creation : 2016/2/23 19:41
# Author : shufeng.lsf@ucweb.com
#######################################################################
import random
from threading import Thread
import requests
import re
import time
from pyquery import PyQuery as pq
from Queue import Queue
import MySQLdb
import uniout
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
community_list = []
HOST = "127.0.0.1"
USER = "root"
PASSWD = ""
DB = "house_analysis"
PORT = 3306
queue = Queue(10)
class DBOperate(object):
    """Thin wrapper around a MySQLdb connection for executing INSERTs.

    The connection is opened eagerly in __init__ and closed when the
    object is garbage-collected.
    """
    def __init__(self, host, user, passwd, db, port, charset="utf8"):
        self.host = host
        self.user = user
        self.passwd = passwd
        self.db = db
        self.port = port
        self.charset = charset
        # Bug fix: the charset parameter was previously ignored and a
        # hard-coded "utf8" was always passed to MySQLdb.connect.
        self.conn = MySQLdb.connect(self.host, self.user, self.passwd,
                                    self.db, self.port, charset=self.charset)
        self.cur = self.conn.cursor()
    def insertSql(self, sql):
        """Execute *sql* and commit immediately.

        NOTE(review): callers that build *sql* by string interpolation are
        vulnerable to SQL injection; prefer parameterized queries via the
        underlying cursor (self.cur.execute(sql, params)).
        """
        self.cur.execute(sql)
        self.conn.commit()
    def __del__(self):
        # Robustness fix: if __init__ raised before the connection was
        # established, self.cur/self.conn do not exist and the original
        # code raised AttributeError during interpreter teardown.
        if hasattr(self, "cur"):
            self.cur.close()
        if hasattr(self, "conn"):
            self.conn.close()
def requestByGet(url):
    """Fetch *url* with a plain HTTP GET and return the raw response body."""
    return requests.get(url).content
def getNextPage(content):
    """Extract the "next page" link from a listing page.

    Returns the href of the next-page anchor, or the empty string when
    the current page is the last one.
    """
    match = re.search(r'<a href="(.+?)" class="next-page next-link">下一页</a>', content)
    return match.group(1) if match else ''
def getCommunityList(content):
community_urls = re.findall(r'data-link="(http://gz.fang.anjuke.com/loupan/\d+?.html)"',content)
print "正在采集...",community_urls
if len(community_urls)>0:
return community_urls
def getHouseInfo(url):
    """Scrape a single community detail page and return its key fields.

    *url* is handed straight to PyQuery, which fetches the page itself.
    Returns a dict with keys: name, area, location, detail_location,
    house_style, price (all stripped strings).
    """
    p = pq(url)
    name = p('h1').text().strip()
    style = p('.house-item').text().split(",")[0].strip()
    price = p('.sp-price').text().strip()
    l = p('.lpAddr-text').text()
    # The address text looks like "[ 区域-板块 ] 详细地址"; split on the
    # bracket delimiters so location[-2] is "区域-板块" and location[-1]
    # is the street address. (Raw string fixes the non-raw regex literal.)
    location = re.split(r'\[ | \]', l)
    area = location[-2].split('-')[0].strip()
    zone = location[-2].split('-')[1].strip()
    # Bug fix: removed the unused local `address`, which duplicated
    # detail_location and was never read.
    detail_location = location[-1].strip()
    result = {
        "name": name,
        "area": area,
        "location": zone,
        "detail_location": detail_location,
        "house_style": style,
        "price": price
    }
    return result
def detailPageHandler(cur, detail_url):
result = getHouseInfo(detail_url)
print "result:",result
cur.insertSql("insert into first_sale (name,area,location,detail_location,house_style,price) VALUES('%s','%s','%s','%s','%s','%s')" % (
result['name'],
result['area'],
result['location'],
result['detail_location'],
result['house_style'],
result['price']
))
class UrlProducer(Thread):
def __init__(self, start_url):
Thread.__init__(self)
self.start_url = start_url
def run(self):
global queue
while True:
content = requestByGet(self.start_url)
next_url = getNextPage(content)
community_urls = getCommunityList(content)
for url in community_urls:
queue.put(url)
time.sleep(random.random())
print "进入队列的url:",url
if next_url != '':
self.start_url = next_url
continue
else:
break
class GetHouseInfo(Thread):
def __init__(self, cur):
Thread.__init__(self)
self.cur = cur
def run(self):
global queue
while True:
url = queue.get()
detailPageHandler(self.cur, url)
queue.task_done()
time.sleep(random.random())
print "处理完毕的url:", url
def main():
    """Wire up the database handle and start the producer/consumer threads."""
    db = DBOperate(host=HOST, user=USER, passwd=PASSWD, db=DB, port=PORT)
    producer = UrlProducer("http://gz.fang.anjuke.com/loupan/?from=navigation")
    consumer = GetHouseInfo(db)
    producer.start()
    consumer.start()
if __name__ == '__main__':
    main()
2.对于不能断点续抓的问题,可以用异常捕获的方式,在程序中断时把当前正在处理的 url 写入文件保存;下次启动时先从该文件读取未完成的 url,从中断位置继续执行即可。

浙公网安备 33010602011771号