#!/bin/python
#encoding=utf-8
import urllib,lxml.html,lxml.html.soupparser,re,pymongo
# Search-result URL with the page number left off; a page index is appended
# to it when walking the listing.
url="http://sou.zhaopin.com/jobs/SearchResult.ashx?in=210500%3b160400%3b160000&pd=1&jl=%E7%A6%8F%E5%B7%9E&sm=0&et=2&p="
# First result page: used to read the page count and the posting date.
url1=url+"1"
def getDocument(url,code='utf-8'):
    """Download ``url`` and parse the response body into an lxml HTML tree.

    url  -- address to fetch.
    code -- character encoding used to decode the raw bytes; byte sequences
            that are invalid in that encoding are dropped instead of raising.

    Returns the element produced by ``lxml.html.fromstring``.
    """
    response=urllib.urlopen(url)
    try:
        raw=response.read()
    finally:
        # Release the socket even when the read fails part-way through.
        response.close()
    return lxml.html.fromstring(raw.decode(code,'ignore'))
def getLastPageNumber(url):
    """Return the page count shown in the pager of the page at ``url``.

    The number is the text of the fourth-from-last <li> inside
    div.pagesDown, converted to int. Raises IndexError when no such element
    exists and ValueError when its text is not an integer.
    """
    pagerItems=getDocument(url).xpath("//div[@class='pagesDown']/ul/li[last()-3]")
    lastPageItem=pagerItems[0]
    return int(lastPageItem.text_content())
def getTodayTime(url):
    """Return the stripped text of the first td.releasetime cell at ``url``.

    Raises IndexError when the page has no such cell.
    """
    page=getDocument(url)
    releaseTimes=page.xpath("//td[@class='releasetime']/text()")
    return releaseTimes[0].strip()
def getPageListAndUrl():
    """Collect job titles and their detail links from every result page.

    Walks pages 1..N of the listing, where N comes from
    ``getLastPageNumber(url1)`` and each page address is ``url`` plus the
    page number.

    Returns a dict mapping the link text of each job to its href. When two
    postings share the same title, the one seen last wins.
    """
    anchorXpath="//tr[@class='showTR']/td[@class='Jobname']/a"
    jobs={}
    for i in range(1,getLastPageNumber(url1)+1):
        doc=getDocument(url+str(i))
        # Take title and href from the same <a> element. Querying text()
        # and @href separately and zipping the two lists mis-pairs every
        # later entry as soon as one anchor has no direct text node or has
        # several (e.g. text split around nested markup).
        for anchor in doc.xpath(anchorXpath):
            href=anchor.get('href')
            if href is None:
                continue
            jobs[anchor.text_content()]=href
    return jobs
def filterWord(jobList):
    """Drop every job whose title contains none of the accepted keywords.

    jobList -- dict keyed by job title; it is modified in place.

    Matching is a case-sensitive substring test against each keyword.
    Returns the same dict object that was passed in.
    """
    AccessWord=[u'前端',u'web',u'程序',u'工程',u'技术',u'\\']
    # Iterate over a snapshot of the keys: deleting entries while iterating
    # a live dict view raises RuntimeError.
    for title in list(jobList):
        if not any(word in title for word in AccessWord):
            del jobList[title]
    return jobList
def sort(url):
    """Compute an integer ranking weight for the job posting at ``url``.

    Three values are read from the detail page: the company name, the first
    integer in the cell selected by ``peopleNumberXpath`` and the first
    integer in the cell selected by ``moneyXpath`` (presumably head count
    and salary -- confirm against the page layout).

    Returns 0 when the company name or the people number cannot be read.
    A missing money figure falls back to 3000. Otherwise returns
    int(0.4*people + 0.4*money + 0.2*getBaiduIndex(companyName)).
    """
    peopleNumberXpath="//table[@class='terminalpage-table']//tr[3]/td[2]/text()"
    moneyXpath="//table[@class='terminalpage-table table-margin']//tr[3]/td[2]/text()"
    companyNameXpath="//td/h2/a/text()"
    doc=getDocument(url)
    # Only parse failures are treated as "field missing"; a bare except
    # would also swallow KeyboardInterrupt and SystemExit.
    try:
        companyName=doc.xpath(companyNameXpath)[0].strip()
    except IndexError:
        return 0
    try:
        people=int(re.findall(r'\d+',doc.xpath(peopleNumberXpath)[0])[0])
    except (IndexError,ValueError):
        return 0
    try:
        money=int(re.findall(r'\d+',doc.xpath(moneyXpath)[0])[0])
    except (IndexError,ValueError):
        # No usable figure on the page: use the default.
        money=3000
    companyIndex=getBaiduIndex(companyName)
    return int(0.4*people+0.4*money+0.2*companyIndex)
def getBaiduIndex(keyword):
    """Return the number shown in Baidu's result-count span for ``keyword``.

    keyword -- text, or bytes that are already UTF-8 encoded. It is sent
               wrapped in double quotes (presumably to request an
               exact-phrase match -- confirm).

    Returns the first integer found in the first span.nums element, with
    thousands separators removed, or 0 when the span is absent or holds no
    digits.
    """
    # Encode text only; calling .encode on a non-ASCII byte string would
    # trigger an implicit ASCII decode and fail.
    if not isinstance(keyword,bytes):
        keyword=keyword.encode('utf-8')
    indexXpath="//span[@class='nums']/text()"
    # Percent-encode the quoted phrase so spaces, '&', '#' and non-ASCII
    # bytes cannot break or truncate the query string.
    url='http://www.baidu.com/s?wd='+urllib.quote(b'"'+keyword+b'"')
    counts=getDocument(url).xpath(indexXpath)
    if not counts:
        return 0
    digits=re.findall(r'\d+',counts[0].replace(',',''))
    if not digits:
        return 0
    return int(digits[0])
def writeDataIntoDb(document):
    """Insert ``document`` into collection ``list`` of database ``job``.

    Connects to MongoDB on localhost:27017 for each call. A failed insert
    prints 'Error' and the function returns normally; a failure to connect
    still propagates to the caller.
    """
    connection=pymongo.Connection('localhost',27017)
    try:
        connection.job.list.insert(document)
    except Exception:
        # Best-effort write: report and carry on, but let KeyboardInterrupt
        # and SystemExit through.
        print('Error')
    finally:
        # One connection is opened per record, so release it explicitly.
        connection.disconnect()
def generateRecord(newList):
    """Score every job in ``newList`` and store one record per job.

    newList -- dict mapping job title to posting URL.

    Each record has the keys 'occupy' (title), 'url', 'weight' (from
    ``sort``) and 'time' (the module-level ``time`` value), and is handed to
    ``writeDataIntoDb``.
    """
    for occupy,link in newList.items():
        weight=sort(link)
        writeDataIntoDb({'occupy':occupy,'url':link,'weight':weight,'time':time})
# Script driver: read the posting date, collect the listings, keep the
# titles that match the keyword filter, then score and store each one.
# ``time`` must be bound before generateRecord runs, because that function
# reads it as a module-level global.
time=getTodayTime(url1)
jobList=getPageListAndUrl()
newList=filterWord(jobList)
generateRecord(newList)