Crawling email addresses with Python

Last time I agonized over this for ages; it turns out the regex can match email addresses without any explicit encoding handling.
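A quick standalone check (my own snippet, not part of the script below) shows the pattern pulling addresses straight out of a raw page string with no decoding step:

import re

mailpattern = re.compile('[A-Za-z0-9_]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')
sample = '<a href="mailto:webmaster@example.com">contact</a> noise@163.com'
print mailpattern.findall(sample)   # ['webmaster@example.com', 'noise@163.com']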

Below is a simple breadth-first crawler built around a queue. I'll leave it like this for now; I probably won't touch it again for a while and will revise it when I have time. (Yet another unfinished project...)

 

# -*- coding: cp936 -*-
import urllib2
import re
from pyquery import PyQuery as pq
from lxml import etree

#mailpattern = re.compile('[^\._:>\\-][\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')
mailpattern = re.compile('[A-Za-z0-9_]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')

htmlcount = 0    # number of pages downloaded so far
maxcount = 3000  # maximum number of pages to crawl
allUrls = set()
allMails = set()
UrlsQlist = []
UrlsQdict = {}
url = "http://www.163.com"
fmails = open("E:/py/crawler/mailresult.txt","a")
furls = open("E:/py/crawler/urlresult.txt","a")


def geturls(data):
    '''extract the urls from an html page'''
    urls = set()
    if data:
        d = pq(data)
        label_a = d.find('a')  # use pyquery to find the <a> tags
        if label_a:
            label_a_href = d('a').map(lambda i,e:pq(e)('a').attr('href'))
            for u in label_a_href:
                if u[0:10] != "javascript":
                    if u[0:4] == "http":
                        urls.add(u)
                    else:
                        urls.add(url + u)
        return urls
    else:
        return None

def gethtml(url):
    '''download an html page, return None on failure'''
    try:
        fp = urllib2.urlopen(url)
    except:
        print "urllib2.urlopen error"
        return None
    else:
        mybytes = fp.read()
        fp.close()
        return mybytes

def savemails(data):
    '''collect the email addresses matched in the page into allMails'''
    if data:
        mailResult = mailpattern.findall(data)
        mailResultset = set(mailResult)
        if mailResultset:
            allMails.update(mailResultset)

def savehtml(pagecontent,count):
    '''save the downloaded page to disk'''
    if pagecontent != None:
        f = open("E:/py/crawler/html/"+str(count)+".html","w")
        f.write(pagecontent)
        f.close()
    else:
        f = open("E:/py/crawler/html/"+str(count)+"error"+".html","w")
        f.write("this page empty")
        f.close()

def BFS(firstUrl):
    '''breadth-first crawl starting from firstUrl, driven by a url queue'''
    global htmlcount
    global maxcount
    allUrls.add(firstUrl)
    UrlsQlist = list(allUrls)
    while htmlcount < maxcount and UrlsQlist:  # stop at the page limit or when the queue runs dry
        tempUrl = UrlsQlist.pop(0)  # the list is used as a FIFO queue
        myWebStr = gethtml(tempUrl)
        savehtml(myWebStr,htmlcount)
        savemails(myWebStr)
        firstUrls_set = geturls(myWebStr)  # links found on the current page
        if firstUrls_set != None:
            allUrls.update(firstUrls_set)  # record every url seen so far
            for u in firstUrls_set:
                if u not in UrlsQlist:
                    UrlsQlist.append(u)
        htmlcount = htmlcount + 1


BFS(url)
for u in allMails:
    try:
        fmails.write(u)
        fmails.write('\n')
    except:
        continue
for u in allUrls:
    try:
        furls.write(u)
        furls.write('\n')
    except:
        continue
fmails.close()
furls.close()
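One thing I may change later: UrlsQlist.pop(0) on a plain list is O(n) per pop, so with a large frontier a collections.deque would behave better. A minimal sketch of the swap (my own note, not part of the script above):

from collections import deque

UrlsQueue = deque()                     # drop-in replacement for the list used as a queue
UrlsQueue.append("http://www.163.com")  # enqueue, same as UrlsQlist.append(u)
tempUrl = UrlsQueue.popleft()           # O(1) dequeue, same role as UrlsQlist.pop(0)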

 


2013.5.13 update

I originally wanted to add multithreading, but after reading a lot of material I still had no idea where to start. I'll keep studying it and change the code later.
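For my own reference, the direction I'm leaning towards is a fixed pool of worker threads pulling urls from a thread-safe Queue.Queue. This is only a rough sketch under that assumption; the worker body is a placeholder and it is not wired into the crawler above:

import threading
import Queue   # Python 2 module name ("queue" in Python 3)

url_queue = Queue.Queue()              # thread-safe FIFO, would replace UrlsQlist
url_queue.put("http://www.163.com")

def worker():
    while True:
        u = url_queue.get()
        try:
            pass   # placeholder: download u, save mails, put() newly found urls
        finally:
            url_queue.task_done()

for _ in range(4):                     # 4 worker threads, an arbitrary choice
    t = threading.Thread(target=worker)
    t.setDaemon(True)                  # let the program exit even if workers block on get()
    t.start()

url_queue.join()                       # wait until every queued url has been processed

The shared allUrls / allMails sets would also need a threading.Lock around updates, which is part of what I still have to think through.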

I also added a bit of URL normalization. The tidied-up code is below (a quick check of url_normal follows the listing):

import urllib2
import re
from pyquery import PyQuery as pq
from lxml import etree
import urlparse
import time

allUrls = set()
allMails = set()
urlsDownlist = []

class mailCrawler:
    def __init__(self,mailExpression,start_url,maxcount):
        '''mailExpression: regex for email addresses;
        start_url: the url to start crawling from;
        maxcount: maximum number of pages to download'''
        self.mailpattern = re.compile(mailExpression)
        self.maxcount = maxcount
        self.htmlcount = 0
        self.UrlsQlist = []  # url queue, gives the breadth-first order
        self.url = start_url


    def url_normal(self,url):
        '''normalize a url'''
        scheme,netloc,path,query = urlparse.urlsplit(url)[:4]
        netloc = netloc.lower()

        if path:
            path = re.sub('/{2,}','/',path)  # collapse repeated / in the path
            path = re.sub(r'\.$','',path)    # drop a trailing dot
            path = re.sub('/$','',path)      # drop a trailing /
            path = re.sub('\s','',path)      # strip whitespace from the path
        if query:
            return '%s://%s%s?%s' % (scheme,netloc,path or '/',query)
        else:
            return '%s://%s%s' % (scheme,netloc,path)

    def geturls(self,data):
        '''extract the urls from an html page'''
        urls = set()
        if data:
            d = pq(data)
            label_a = d.find('a')  # use pyquery to find the <a> tags
            if label_a:
                label_a_href = d('a').map(lambda i,e:pq(e)('a').attr('href'))
                for u in label_a_href:
                    if u[0:10] != "javascript" and u[0:6] != "mailto":
                        if u[0:4] == "http":
                            normal_url = self.url_normal(u)
                            urls.add(normal_url)
                        else:
                            normal_url = self.url_normal(self.url + u)
                            urls.add(normal_url)
            return urls
        else:
            return None

    def gethtml(self,url):
        '''download an html page with a 5 second timeout'''
        try:
            fp = urllib2.urlopen(url,None,5)
        except:
            print "urllib2.urlopen error or timeout"
            return None
        else:
            mybytes = fp.read()
            fp.close()
            return mybytes

    def savemails(self,data):
        '''collect the matched email addresses into allMails; the set removes duplicates'''
        global allMails
        if data:
            mailResult = self.mailpattern.findall(data)
            mailResultset = set(mailResult)
            if mailResultset:
                allMails.update(mailResultset)

    def savehtml(self,pagecontent,htmlcount,url):
        '''save the downloaded page to disk'''
        if pagecontent != None:
            f = open("E:/py/crawler/html/"+str(htmlcount)+".html","w")
            f.write(pagecontent)
            f.close()
        else:
            f = open("E:/py/crawler/html/"+str(htmlcount)+"error"+".html","w")
            try:
                f.write(url)
            except:
                f.write("encode error")
            f.close()

    def BFS(self):
        '''breadth-first crawl driven by a url queue'''
        global allUrls
        global urlsDownlist
        allUrls.add(self.url)
        self.UrlsQlist = list(allUrls)
        while self.htmlcount < self.maxcount and self.UrlsQlist:  # stop at the page limit or when the queue runs dry
            tempUrl = self.UrlsQlist.pop(0)  # the list is used as a FIFO queue
            print tempUrl
            urlsDownlist.append(tempUrl)
            myWebStr = self.gethtml(tempUrl)
            self.savehtml(myWebStr,self.htmlcount,tempUrl)
            self.savemails(myWebStr)
            firstUrls_set = self.geturls(myWebStr)  # links found on the current page
            if firstUrls_set != None:
                for u in firstUrls_set:
                    if u not in allUrls:
                        allUrls.add(u)
                        self.UrlsQlist.append(u)
            self.htmlcount = self.htmlcount + 1


def main():
    reg = r'[A-Za-z0-9_]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+'
    url = "http://www.baidu.com"
    count = 100
    fmails = open("E:/py/crawler/mailresult.txt","a")
    furls = open("E:/py/crawler/urlresult.txt","a")
    fdownUrls = open("E:/py/crawler/urlDownresult.txt","a")
    newcrawler = mailCrawler(reg,url,count)
    newcrawler.BFS()
    for u in allMails:
        try:
            fmails.write(u)
            fmails.write('\n')
        except:
            continue
    for u in allUrls:
        try:
            furls.write(u)
            furls.write('\n')
        except:
            continue
    for u in urlsDownlist:
        try:
            fdownUrls.write(u)
            fdownUrls.write('\n')
        except:
            continue
    fmails.close()
    furls.close()
    fdownUrls.close()

if __name__ == '__main__':
    main()
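A quick check of url_normal, assuming the class above (the comment shows the output I expect):

reg = r'[A-Za-z0-9_]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+'
crawler = mailCrawler(reg, "http://www.baidu.com", 100)
print crawler.url_normal("HTTP://WWW.Baidu.COM//s//?wd=mail")
# -> http://www.baidu.com/s?wd=mail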

 

 

posted @ 2013-05-02 21:27  细胞核