爬取智联招聘信息
import scrapy
from jobspider.items import JobspiderItem
import logging
class JobSpider(scrapy.Spider):
    """Scrape job listings from a Zhaopin search-result page.

    The single start URL is a keyword search for ``java`` (``kw=java``),
    page 1 (``p=1``). Each listing on the page is rendered as its own
    ``<table class="newlist">`` inside the
    ``<div id="newlist_list_content_table">`` container; one
    ``JobspiderItem`` is yielded per listing.
    """

    name = "job_spider"
    start_urls = [
        "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&kw=java&isadv=0&sg=df4b40a6bfaf49c08ef0cb9e8e2181f2&p=1"
    ]

    def parse(self, response):
        """Extract one item per job table found in *response*.

        Args:
            response: The Scrapy response for a search-result page.

        Yields:
            JobspiderItem: populated with ``jobname``, ``companyname``,
            ``salary`` and ``workplace``. ``jobname`` is always a string
            (possibly empty); the other three fields are ``None`` when
            the corresponding cell has no matching text node.
        """
        # Debugging tip: to inspect the markup the selectors run against,
        # write ``response.body`` to a local file and open it in a browser.
        jobs = response.xpath(
            '//div[@id="newlist_list_content_table"]/table[@class="newlist"]'
        )

        # The first matching table is skipped -- presumably it is the
        # column-header table rather than a listing; confirm against the
        # live page if the markup changes.
        for job in jobs[1:]:
            item = JobspiderItem()

            # The title is split across several text nodes (the search
            # keyword is wrapped in <b>), so all of them are joined. The
            # same XPath also matches the badge link that follows the
            # title, whose only text is whitespace between its <img>
            # tags, so the joined string is stripped to drop that padding.
            title_parts = job.xpath(
                './/td[@class="zwmc"]/div/a//text()'
            ).extract()
            item['jobname'] = ''.join(title_parts).strip()

            item['companyname'] = job.xpath(
                './/td[@class="gsmc"]/a/text()'
            ).extract_first()
            item['salary'] = job.xpath(
                './/td[@class="zwyx"]/text()'
            ).extract_first()
            item['workplace'] = job.xpath(
                './/td[@class="gzdd"]/text()'
            ).extract_first()
            yield item
爬取智联招聘。
智联招聘搜索结果页的 HTML 结构(每条职位对应一个 class="newlist" 的 table,上面 spider 中的 XPath 即据此编写):
<div class="newlist_list_content" id="newlist_list_content_table">
<table class="newlist" width="853" cellspacing="0" cellpadding="0">
<tr>
<td class="zwmc" style="width: 250px;">
<input name="vacancyid" data-monitor="CZ751712970J00017764214|3" value="CZ751712970J00017764214_719_1_03_409__1_" onclick="zlapply.uncheckAll('allvacancyid')" type="checkbox">
<div style="width: 224px;*width: 218px; _width:200px; float: left">
<a style="font-weight: bold" par="ssidkey=y&ss=409&ff=03&sg=df4b40a6bfaf49c08ef0cb9e8e2181f2&so=3" href="http://jobs.zhaopin.com/CZ751712970J00017764214.htm" target="_blank"><b>java</b>开发工程师 </a><a href="http://e.zhaopin.com/products/1/detail.do" target="_blank" title="点击“顶”字,了解更多"><img src="/assets/images/top.png" border="0" align="absmiddle"> <img src="/assets/images/jp.gif" border="0" align="absmiddle"></a>
</div>
</td>
<td style="width: 60px;" class="fk_lv"><span>64%</span></td>
<td class="gsmc"><a href="http://company.zhaopin.com/CZ751712970.htm" target="_blank">北京中科网联信息技术研究院(有限合伙)</a> <a href="http://company.zhaopin.com/CZ751712970.htm" target="_blank" style="vertical-align: top;"><img src="//img03.zhaopin.cn/IHRNB/img/souvip1002.png" alt="1002" class="icon_vip" border="0" align="absmiddle"></a></td>
<td class="zwyx">4001-6000</td>
<td class="gzdd">郑州</td>
<td class="gxsj"><span>置顶</span><a class="newlist_list_xlbtn" href="javascript:;"></a></td>
</tr>
<tr style="display: none" class="newlist_tr_detail">
<td style="line-height: 0;" colspan="6" width="833px">
<div class="newlist_detail">
<div class="clearfix">
<ul>
<li class="newlist_deatil_two"><span>地点:郑州</span><span>公司性质:民营</span><span>经验:1-3年</span><span>学历:不限</span><span>职位月薪:4001-6000元/月</span></li><li class="newlist_deatil_last">...<b>Java</b>开发经验,熟悉J2EE体系结构,并能熟悉掌握SSH等开源框架; 3. 能熟练掌握和开发Web Service、SOAP、Socket、NIO等开发技术,对http、tcp、udp协议有一定的了解; 4. 精通Ajax、<b>Java</b>Script、HTML5等前...</li>
</ul>
<dl>
<dt>
<a href="javascript:zlapply.searchjob.ajaxApplyBrig1('CZ751712970J00017764214_719','ssi','_1_03_409__2_');searchMonitor.logSingleApplyData('CZ751712970J00017764214|3');">
<img src="/assets/images/newlist_sqimg_03.jpg">
</a>
</dt>
<dd><a href="javascript:zlapply.searchjob.saveOne('CZ751712970J00017764214_719');"><img src="/assets/images/newlist_scimg_06.jpg"></a></dd>
</dl>
</div>
</div>
</td></tr>
</table>
</div>

浙公网安备 33010602011771号