Web crawler

import requests

def gethtmltext(url):
    """Fetch a URL and return its text, or an empty string on failure."""
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()        # raise an HTTPError for 4xx/5xx responses
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return ""

url = "https://www.baidu.com"
for i in range(20):                 # fetch and print the same page 20 times
    print(gethtmltext(url))
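
If the target page is not actually UTF-8, hard-coding r.encoding can garble the text. A minimal variant (a sketch, not part of the original post; the name gethtmltext_guess is made up here) lets requests guess the encoding from the response body instead:

import requests

def gethtmltext_guess(url):
    """Like gethtmltext above, but guesses the page encoding instead of assuming UTF-8."""
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding    # let requests guess the encoding from the body
        return r.text
    except requests.RequestException:
        return ""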

Parsing a web page

import re
from bs4 import BeautifulSoup

html = '''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>菜鸟教程(runoob.com)</title>
</head>
<body>
    <h1>我的第一个标题</h1>
    <p>我的第一个段落。</p>
</body>
</html>'''

def getChinese(html):
    """Return only the Chinese (CJK) characters contained in the given text."""
    html_unicode = html.strip()
    pattern = re.compile('[^\u4e00-\u9fff]')          # matches every non-Chinese character
    chinese = "".join(pattern.split(html_unicode))    # split on them and rejoin what is left
    return chinese

soup = BeautifulSoup(html, "html.parser")
print("获取head标签内容:")
print(soup.head)
print("学号号数为11")
print()
print("获取body标签内容:")
print(soup.body)
print()
print(soup.title)
print()
print("获取html中的中文字符")
print(getChinese(html))
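
The regular expression above keeps characters in the range \u4e00-\u9fff by splitting on everything else. An equivalent sketch (a hypothetical helper, not from the original post, assuming the same html string) strips the markup with BeautifulSoup first and then collects the Chinese characters directly with re.findall:

import re
from bs4 import BeautifulSoup

def getChineseText(html):
    """Hypothetical alternative: drop the tags, then keep only the CJK characters of the visible text."""
    text = BeautifulSoup(html, "html.parser").get_text()
    return "".join(re.findall('[\u4e00-\u9fff]', text))

print(getChineseText(html))   # prints the same characters as getChinese(html), since the tags here contain no Chinese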

University rankings

import requests
from bs4 import BeautifulSoup
import pandas

def getHTMLText(url):
    """Fetch a URL and return its text, or an empty string on failure."""
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return ""

def fillUnivList(soup):
    """Collect one list per table row, skipping rows that have no <td> cells."""
    t_list = []
    data = soup.find_all('tr')
    for tr in data:
        ltd = tr.find_all('td')
        if len(ltd) == 0:
            continue
        singleUniv = []
        for td in ltd:
            singleUniv.append(td.string)
        t_list.append(singleUniv)
    return t_list

def printUnivList(num, t_list):
    """Print the first num rows; chr(12288) is the full-width space used as the fill character."""
    print("{1:^4}{2:{0}^10}{3:{0}^5}{4:{0}^8}".format(chr(12288), "排名", "学校名称", "省市", "总分"))
    for i in range(num):
        u = t_list[i]
        print("{1:^4}{2:{0}^10}{3:{0}^5}{4:{0}^8}".format(chr(12288), u[0], u[1], u[2], eval(u[3])))

def saveCSV(file_name, t_list):
    """Save the scraped rows to a CSV file with named columns."""
    FormData = pandas.DataFrame(t_list)
    FormData.columns = ["排名", "学校名称", "省市", "总分", "生源质量", "培养成果", "人才培养得分"]
    FormData.to_csv(file_name, encoding='utf-8', index=False)

def main(num):
    url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2020.html'
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    data = fillUnivList(soup)
    printUnivList(num, data)
    saveCSV("E:\\daxuepaiming_data.csv", data)

main(10)
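
As a quick sanity check (a sketch, not part of the original post, reusing the output path passed to saveCSV above), the written CSV can be loaded back with pandas:

import pandas

# Re-read the file written by saveCSV and show the first rows.
df = pandas.read_csv("E:\\daxuepaiming_data.csv", encoding='utf-8')
print(df.head(10))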
