# -*- coding: utf-8 -*-
__author__ = 'dell'
from lxml import etree
import urllib2
import time
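# Estimates how many shop listings dianping.com has for one category in every
# city: category and city id mappings are read from local text files, one
# search page is fetched per city, the result count is scraped from the page,
# and the total is turned into a rough crawl-time estimate (~5 seconds/page).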
def loadCategory():
    """Load the category mapping (name -> id) from catetory.txt, one GBK-encoded 'id<TAB>name' line at a time."""
    res = {}
    with open('catetory.txt') as f_txt:
        for line in f_txt:
            line = line.strip().decode('gbk')
            tokens = line.split('\t')
            if len(tokens) < 2:
                continue
            key = tokens[1].strip()
            print key
            val = tokens[0].strip()
            res[key] = val
    return res
def loadCity():
    """Load the city mapping (name -> id) from city.txt, one GBK-encoded 'name:id' line at a time."""
    res = {}
    with open('city.txt') as f_txt:
        for line in f_txt:
            line = line.strip().decode('gbk')
            tokens = line.split(':')
            if len(tokens) < 2:
                continue
            key = tokens[0].strip()
            val = tokens[1].strip()
            if key in res:
                print 'repeated city:', key
            else:
                res[key] = val
    return res
cats = loadCategory()
# for key in cats.keys():
# print key, cats[key]
citys = loadCity()
# for key in citys.keys():
# print key, citys[key]
print 'number of categories:', len(cats)
print 'number of cities:', len(citys)
print 'generating urls ...'
standard = 'http://www.dianping.com/search/category/%s/%s'
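# Example with hypothetical ids cityId='1', catId='10':
# standard % ('1', '10') -> 'http://www.dianping.com/search/category/1/10'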
def gen(cateName):
    """Return a list of (search url, city name) pairs for the given category name."""
    res = []
    if cateName in cats:
        catId = cats[cateName]
        for cityName, cityId in citys.items():
            url = standard % (cityId, catId)
            res.append((url, cityName))
    return res
def getHtml(url):
    """Fetch a page with a desktop browser User-Agent and return the decoded HTML."""
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0')
    doc = urllib2.urlopen(request, timeout=45).read().decode('utf8')
    return doc
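# A possible retry wrapper around getHtml -- a sketch, not part of the original
# flow. The name getHtmlWithRetry and the retries/pause defaults are assumptions;
# transient network failures are retried a few times with a pause in between.
# It could be called in place of getHtml in the loop below if retries are wanted.
import socket

def getHtmlWithRetry(url, retries=3, pause=5):
    for attempt in range(retries):
        try:
            return getHtml(url)
        except (urllib2.URLError, socket.timeout):
            if attempt == retries - 1:
                raise  # give up after the last attempt
            time.sleep(pause)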
def getFetchHour(count):
    """Estimated crawl time in hours, assuming ~5 seconds per page."""
    return count * 5.0 / 3600

def getFetchDay(count):
    """Estimated crawl time in days under the same assumption."""
    return (count * 5.0 / 3600) / 24
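# Worked example with a hypothetical count of 72000 listings:
# getFetchHour(72000) = 72000 * 5.0 / 3600 = 100.0 hours
# getFetchDay(72000)  = 100.0 / 24         ≈ 4.17 days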
urllist = gen(u'购物')  # u'购物' == "Shopping" category
print len(urllist)
total = 0  # running total of result counts across all cities ('sum' would shadow the builtin)
for u in urllist:
    html = getHtml(u[0])
    tree = etree.HTML(html)
    # the span with class 'Color7' holds the result count, e.g. '(1234)'
    hnc = tree.xpath("//span[@class='Color7']")
    for hn in hnc:
        strnum = hn.text.replace('(', '').replace(')', '')
        print u[1], strnum
        total += int(strnum)
    # time.sleep(5)
print total
print 'fetch time (hours):', getFetchHour(total)
print 'fetch time (days):', getFetchDay(total)