# -*- coding: utf-8 -*-
"""Scrape listing titles from bbs.212300.com forum 56 into a tab-separated file.

Every title that mentions both an area ("<number>平") and a total price
("<number>万") is written out together with the price per unit of area
derived from those two figures.

The code is written to run unchanged on Python 2 and Python 3.
"""
import codecs
import itertools
import re

# A number greater than zero: either an integer/decimal that starts with a
# non-zero digit, or "0.<digits>" containing at least one non-zero digit.
# Trailing zeros in the fraction ("85.50") are accepted.
_POSITIVE_NUMBER = r'((?:[1-9]\d*(?:\.\d+)?)|(?:0\.\d*[1-9]\d*))'
_AREA_PATTERN = re.compile(_POSITIVE_NUMBER + '平')
_PRICE_PATTERN = re.compile(_POSITIVE_NUMBER + '万')

# Seconds to wait for the forum before giving up on a request.
_REQUEST_TIMEOUT = 30

OUTPUT_PATH = 'C:/tobacco/11/dyfj.csv'
URL_TEMPLATE = 'http://bbs.212300.com/forum-56-%d.html'


def _to_native_str(text):
    """Return *text* as the interpreter's native ``str`` type.

    On Python 2 a ``unicode`` value is encoded to UTF-8 bytes; on Python 3
    ``bytes`` are decoded from UTF-8.  Native ``str`` values pass through.
    """
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        return text.decode('utf-8')
    return text.encode('utf-8')


def _to_bytes(text):
    """Return *text* as UTF-8 encoded bytes (bytes pass through untouched)."""
    if isinstance(text, bytes):
        return text
    return text.encode('utf-8')


def formatstr(str1):
    """Build one tab-separated output row from a listing title.

    The first "<number>平" found in *str1* is used as the area and the first
    "<number>万" as the total price.  The price is multiplied by 10000 before
    being divided by the area.  Both numbers are required to be greater than
    zero by the patterns, so the division cannot hit a zero area.

    Args:
        str1: Listing title as a native ``str``.

    Returns:
        The row ``面积 <area> 总价 <price> <unit price> <str1>`` joined with
        tab characters, where the unit price is truncated to an integer, or
        ``None`` when the title lacks either figure.
    """
    area_match = _AREA_PATTERN.search(str1)
    price_match = _PRICE_PATTERN.search(str1)
    if area_match is None or price_match is None:
        return None

    area = area_match.group(1)
    price = price_match.group(1)
    unit_price = float(price) / float(area) * 10000
    return '\t'.join(['面积', area, '总价', price, str(int(unit_price)), str1])


def export_title(url, f):
    """Fetch one forum page and append a row for every usable title to *f*.

    Args:
        url: Address of the forum page to download.
        f: Writable file object that accepts bytes (opened in binary mode;
            a Python 2 text-mode file works as well).

    Returns:
        ``True`` when the page was downloaded and processed.  ``False`` when
        the request failed, the server did not answer with HTTP 200, or an
        error occurred while formatting or writing a row; the caller uses
        ``False`` as the signal to stop crawling.
    """
    # Third-party imports are deferred so that formatstr() stays importable
    # on machines that do not have requests/lxml installed.
    import requests
    from lxml import html

    try:
        page = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(e)
        return False
    if page is None or page.status_code != 200:
        return False

    # Parse the markup so the titles can be selected with XPath.
    tree = html.fromstring(page.text)
    titles = tree.xpath('//th[contains(@class,"new")]/a/text()')

    for raw_title in titles:
        title = _to_native_str(raw_title).replace('New', '')
        # str.replace returns a new string, so the result must be kept.
        title = title.replace('\t', '').replace('\n', '').replace(' ', '')
        if not title:
            continue
        try:
            row = formatstr(title)
            if row is not None:
                print(row)
                f.write(_to_bytes(row + '\n'))
        except Exception as e:
            # Best effort: report the problem and tell the caller to stop.
            print(e)
            return False
    return True


def main():
    """Crawl forum pages 1, 2, 3, ... until export_title() reports failure."""
    # Binary mode lets the BOM and the UTF-8 rows be written as bytes on
    # both Python 2 and Python 3; the context manager closes the file.
    with open(OUTPUT_PATH, 'wb') as f:
        # UTF-8 byte-order mark first; presumably so that spreadsheet tools
        # detect the encoding -- TODO confirm.
        f.write(codecs.BOM_UTF8)
        # NOTE(review): the loop only ends when export_title() returns
        # False.  If the forum answers out-of-range page numbers with a
        # valid page, this never terminates -- confirm against the site.
        for page in itertools.count(1):
            url = URL_TEMPLATE % page
            print(url)
            if not export_title(url, f):
                print('finished!!')
                break


if __name__ == "__main__":
    main()
# Stray web-page footer captured with the paste (not part of the script): 浙公网安备 33010602011771号