#!/bin/python
#coding=utf-8
import urllib,xlrd,lxml.html,re,pymongo,xlwt
# Module-level log file that readExcelData writes a row to whenever the
# Baidu lookup for that row raises. Opened in 'w' mode, so every run starts
# with an empty log.
# NOTE(review): the handle is never closed explicitly in the visible code.
fail=open('fail','w')
def getDocument(url,code='utf-8'):
    """Download *url* and parse the response body into an lxml HTML tree.

    The body is decoded with *code* first. If that codec is unknown or the
    bytes are not valid in it, the same bytes are decoded as GB2312 instead.

    Args:
        url: Address of the page to fetch.
        code: Name of the codec to try first (defaults to 'utf-8').

    Returns:
        The object returned by ``lxml.html.fromstring`` for the decoded page.

    Raises:
        UnicodeDecodeError: If the body is valid in neither *code* nor GB2312.
        Any error raised by ``urllib.urlopen``, ``read`` or
        ``lxml.html.fromstring`` propagates unchanged; no decoding fallback
        is attempted for those.
    """
    response=urllib.urlopen(url)
    try:
        # Read once so the fallback below never downloads the page again.
        raw=response.read()
    finally:
        response.close()
    try:
        text=raw.decode(code)
        # Progress output: report the codec that actually decoded the page.
        print(code)
    except (UnicodeDecodeError,LookupError):
        text=raw.decode('Gb2312')
    return lxml.html.fromstring(text)
def getBaiduIndex(keyword):
    """Return the result count Baidu shows for an exact-phrase search.

    Args:
        keyword: Text to search for; it is encoded to UTF-8 before being
            placed in the query string.

    Returns:
        The first run of digits (commas removed) found in the text of the
        first ``<span class="nums">`` element of the result page, as an int.
        Returns 0 when no such element exists or it contains no digits.
    """
    keyword=keyword.encode('utf-8')
    # Percent-encode the double-quoted phrase so spaces, '&', '#' and
    # non-ASCII bytes cannot break or truncate the query string.
    url='http://www.baidu.com/s?wd='+urllib.quote('"'+keyword+'"')
    doc=getDocument(url)
    matches=doc.xpath("//span[@class='nums']/text()")
    if not matches:
        return 0
    digits=re.findall(r'\d+',matches[0].replace(',',''))
    if not digits:
        # The element was present but held no number; treat it as no hits.
        return 0
    return int(digits[0])
def readExcelData():
    """Look up a Baidu index for every spreadsheet row and store the result.

    Reads the first sheet of the workbook opened below. For each row, the
    first cell is passed to getBaiduIndex. The first six cells are mapped to
    the keys name/address/lawer/occupy/phone/cellphone, the lookup result is
    stored under 'weight', and the record is passed to insertDatabase.

    Rows whose lookup raises are appended to the module-level ``fail`` file
    (one row per line) and skipped, so a record is never stored with a
    missing weight or one left over from an earlier row.
    """
    fields=['name','address','lawer','occupy','phone','cellphone']
    table=xlrd.open_workbook('福建111.xls').sheets()[0]
    for i in range(table.nrows):
        # Progress output: index of the row being processed.
        print(i)
        line=table.row_values(i)
        try:
            index=getBaiduIndex(line[0])
        except Exception:
            fail.write(str(line)+'\n')
            continue
        record=dict(zip(fields,line))
        # Set explicitly so the weight lands under 'weight' even when the
        # row has more or fewer than six columns.
        record['weight']=index
        insertDatabase(record)
def readSotedDb():
    """Write the stored records to 'new.xls', ordered by descending weight.

    Queries the ``excel.index`` collection on localhost:27017 without the
    ``_id`` field, sorted on 'weight' in descending order. Each record
    becomes one row of sheet 'sheet 1', with columns in the order
    name, address, lawer, occupy, phone, cellphone.
    """
    columns=('name','address','lawer','occupy','phone','cellphone')
    records=pymongo.Connection('localhost',27017).excel.index
    ordered=records.find({},{'_id':0}).sort("weight",pymongo.DESCENDING)
    workbook=xlwt.Workbook()
    worksheet=workbook.add_sheet('sheet 1')
    for rowNumber,record in enumerate(ordered):
        for columnNumber,key in enumerate(columns):
            worksheet.write(rowNumber,columnNumber,record[key])
    workbook.save('new.xls')
def insertDatabase(document):
    """Insert *document* into the ``excel.index`` collection on localhost:27017.

    Args:
        document: The record to store.

    An error raised by the insert call is printed together with the
    document and is not re-raised. Errors raised while connecting are not
    caught here.
    """
    # NOTE(review): a new connection is opened on every call; consider
    # sharing one across inserts.
    collection=pymongo.Connection('localhost',27017).excel.index
    try:
        collection.insert(document)
    except Exception as error:
        # Catch Exception rather than everything, so Ctrl-C and SystemExit
        # still stop the script, and show why the insert failed.
        print('Insert Data Error %s (%r)' % (document,error))
# Script entry point: the spreadsheet import starts as soon as this module is
# executed. There is no __main__ guard, so importing the module runs it too.
readExcelData()