# 第一个爬虫和测试

import requests
# Request the search home page 20 times. Only the response of the final
# request is kept in `r` and reported by the prints below.
for _ in range(20):
    # requests never times out by default; an explicit timeout keeps a
    # stalled connection from hanging the script indefinitely.
    r = requests.get("https://www.so.com//", timeout=30)
print(r.status_code)      # HTTP status code of the last response
print(r.text)             # response body decoded to str
print(type(r.text))
print(type(r.content))
print(len(r.content))     # size of the raw response body

from bs4 import BeautifulSoup
import re
# Sample document used to exercise BeautifulSoup navigation and searching.
html = """
<!DOCTYPE html>
<head>
<meta charset="utf-8">
<title>菜鸟教程(runoob.com)</title>
</head>
<body>
    <h1>我的第一个标题</h1>
    <p id="first">我的第一个段落</p>
</body>
        <table border="1">
    <tr>
        <td>row 1, cell 1</td>
        <td>row 1, cell 2</td>
    </tr>
    <tr>
        <td>row 2, cell 1</td>
        <td>row 2, cell 2</td>
    </tr>
</table>
</html>
"""
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed (and no warning is emitted).
abc = BeautifulSoup(html, "html.parser")
# <head> and <body> each hold several children, so `.string` is None for
# both; get_text() collects the text of all descendants instead.
print(abc.head.get_text(strip=True) + '\n' + '47')  # head text plus the student number
print(abc.body.get_text("\n", strip=True))  # text content of the body tag
print(abc.find_all(id="first"))
r = abc.text
# Extract the Chinese text: match whole runs of CJK unified ideographs
# rather than single characters from an over-broad range.
zhongwen = re.findall(r'[\u4e00-\u9fff]+', r)
print(zhongwen)
import requests
from bs4 import BeautifulSoup
import bs4
 
def getHTMLText(url):
    """Download *url* and return the response body as text.

    Used to fetch the university-ranking page. The request is limited to
    30 seconds. Returns an empty string if the request fails or the
    server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode with the encoding detected from the body content rather
        # than the one inferred from the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: network and HTTP failures yield "", while
        # programming errors and KeyboardInterrupt still propagate.
        return ""
 
def fillUnivList(ulist, html):
    """Pick the wanted cells out of the fetched page and append them to *ulist*.

    The first <tbody> in *html* is scanned; for every child tag, the
    ``.string`` of its 1st, 2nd and 4th <td> cells is appended to *ulist*
    as a three-element list. *ulist* is modified in place.

    Nothing is appended when *html* contains no <tbody> (for example when
    the download failed and *html* is empty). Rows with fewer than four
    <td> cells are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        return
    for tr in tbody.children:
        # Children also include whitespace text nodes; only tags are rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')  # calling a tag is shorthand for find_all
        if len(tds) < 4:
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])
 
def printUnivList(ulist, num):
    """Print a header line followed by up to *num* entries of *ulist*.

    Each entry is indexed at positions 0, 1 and 2, printed under the
    rank, school-name and total-score headers as tab-separated centred
    columns. If *ulist* holds fewer than *num* entries, only the
    available ones are printed. A field that is ``None`` is printed as
    an empty column.
    """
    row_format = "{:^10}\t{:^6}\t{:^10}"
    print(row_format.format("排名", "学校名称", "总分"))
    # Slicing (instead of indexing range(num)) tolerates short lists;
    # max() keeps a negative num printing nothing, as range() did.
    for u in ulist[:max(num, 0)]:
        # None does not accept a "^10" format spec, so normalise to str.
        fields = ["" if value is None else str(value) for value in u[:3]]
        print(row_format.format(*fields))
 
def main():
    """Fetch the 2018 ranking page, extract its rows and print the first 20."""
    ranking_url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2018.html'
    rows = []
    page = getHTMLText(ranking_url)
    fillUnivList(rows, page)
    printUnivList(rows, 20)


# Run the crawler as soon as this file is executed.
main()

  

# posted @ 2020-05-13 21:18  糖加灰先生  阅读(124)  评论(0)    收藏  举报