爬虫


import urllib.request

# Download the Sogou home page once and print its decoded HTML.
url = "https://www.sogou.com/"
# The context manager closes the HTTP response even if read/decode fails;
# the timeout keeps the script from hanging on an unresponsive server.
with urllib.request.urlopen(url, timeout=30) as response:
    content = response.read().decode('utf-8')
# NOTE(review): the page is fetched a single time and the same text is
# printed 20 times. If 20 separate requests were intended, the fetch has
# to move inside this loop -- confirm.
for _ in range(20):
    print(content)
(以下仅为一部分结果截图):

import requests
def gethtmltext(url):
    """Fetch *url* and report how large the response body is.

    The body is decoded as UTF-8 before its text length is measured.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(text_length, content_length)`` tuple: the number of characters
        in the decoded text and the number of bytes in the raw body.
        Returns ``""`` when the request fails (connection error, timeout,
        invalid URL, or a 4xx/5xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx answers into an exception so they take the failure path.
        r.raise_for_status()
    except requests.RequestException:
        # Only request-level failures are treated as "no result"; anything
        # else (e.g. KeyboardInterrupt or a programming error) propagates.
        return ""
    r.encoding = 'utf-8'
    return len(r.text), len(r.content)
 
# Fetch the Sogou home page and print whatever gethtmltext() reports for it.
url = "https://www.sogou.com/"
result = gethtmltext(url)
print(result)

 (3)

# Sample HTML document used as input for the BeautifulSoup and
# Chinese-character extraction examples in this script.
# Note that the <table> element is placed after </body> in this markup.
html = '''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>菜鸟教程(runoob.com)</title>
</head>
<body>
<h1>我的第一个标题</h1>
<p id="first">我的第一个段落。</p>
</body>
<table border="1">
<tr>
<td>row 1, cell 1</td>
<td>row 1, cell 2</td>
</tr>
<tr>
<td>row 2, cell 1</td>
<td>row 2, cell 2</td>
</tr>
</table>
</html>'''
import re
def getChinese(html):
    """Return the Chinese characters of *html*, in order, as one string.

    Every character outside the range U+4E00..U+9FFF (markup, Latin text,
    digits, whitespace, punctuation) is dropped.
    """
    trimmed = html.strip()
    non_chinese = re.compile('[^\u4e00-\u9fff]')
    # Deleting every non-Chinese character leaves the Chinese ones joined.
    return non_chinese.sub("", trimmed)
from bs4 import BeautifulSoup
# Parse the sample document. The parser is named explicitly so the result
# does not depend on which optional parsers happen to be installed (and so
# bs4 does not warn about guessing one); "html.parser" ships with Python.
soup = BeautifulSoup(html, "html.parser")
print("获取head标签内容:")
print(soup.head)
print("学号后两位:04")
print()
print("获取body标签内容:")
print(soup.body)
print()
print("id为first的标签对象:")
# Look the element up by its id; soup.p would merely return the first <p>
# in the document, whatever its id is.
print(soup.find(id="first"))
print()
print("获取html中的中文字符")
print(getChinese(html))

 

import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    """Download *url* and return its body as text.

    The text is decoded with the encoding detected from the body
    (``apparent_encoding``) rather than the one declared in the headers.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or ``""`` when the request fails
        (connection error, timeout, invalid URL, or a 4xx/5xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx answers into an exception so they take the failure path.
        r.raise_for_status()
    except requests.RequestException:
        # Only request-level failures map to the empty result; unrelated
        # errors (e.g. KeyboardInterrupt) are no longer swallowed.
        return ""
    r.encoding = r.apparent_encoding
    return r.text
def fillUnivList(ulist,html):
    """Append one entry per table row found in *html* to *ulist*.

    Each appended entry is a list holding the ``.string`` of the first five
    ``<td>`` cells of a row inside the first ``<tbody>`` of the document.

    Args:
        ulist: List that is extended in place.
        html: HTML text to parse. May be empty (e.g. after a failed
            download), in which case nothing is appended.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # No table body at all (empty or unexpected page): nothing to add.
        return
    for tr in tbody.children:
        # Children also include whitespace text nodes; only tags are rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        if len(tds) < 5:
            # Skip rows that do not carry the five expected cells instead of
            # failing with IndexError.
            continue
        ulist.append([td.string for td in tds[:5]])
def printUnivList(ulist,num):
    """Print a header line followed by up to *num* rows of *ulist*.

    Columns are tab-separated and centred in 10-character fields; the school
    name and province columns are padded with the full-width space U+3000 so
    that Chinese text lines up.

    Args:
        ulist: Sequence of rows, each with at least five formattable items.
        num: Maximum number of rows to print. When it exceeds the number of
            rows available, only the available rows are printed.
    """
    row_format = "{0:^10}\t{1:{5}^10}\t{2:{5}^10}\t{3:^10}\t{4:^10}"
    pad = chr(12288)  # full-width (ideographic) space
    print(row_format.format("排名","学校名称","省份","总分","生源质量",pad))
    # Slice rather than index so a short list cannot raise IndexError;
    # max() keeps a negative num meaning "no rows", as range(num) did.
    for u in ulist[:max(num, 0)]:
        print(row_format.format(u[0],u[1],u[2],u[3],u[4],pad))
def main():
    """Download the 2016 ranking page, parse it and print 310 rows."""
    ranking_url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html"
    rows = []
    page = getHTMLText(ranking_url)
    fillUnivList(rows, page)
    printUnivList(rows, 310)


main()
posted @ 2020-12-14 00:21  εε  阅读(96)  评论(0)    收藏  举报