Web Scraping Exercises

Small scraping projects

0. Scraping university rankings

import bs4
import requests
from bs4 import BeautifulSoup


# Fetch the HTML text of a page given its URL
def getHTMLText(url):
    # Guard against network/HTTP errors while fetching the page
    try:
        # Request the URL with a 30-second timeout
        r = requests.get(url, timeout=30)
        # Raise an exception if the HTTP status code indicates an error
        r.raise_for_status()
        # Use the encoding detected from the page content
        r.encoding = r.apparent_encoding
        # On success, return the page text
        return r.text
    except:
        # On any error, return an empty string instead
        return ""


# Fill the (initially empty) list ulist with one entry per university parsed from the page text
def fillUnivList(ulist, html):
    # Parse the page text with BeautifulSoup using the 'html.parser' parser
    soup = BeautifulSoup(html, "html.parser")
    # The ranking rows all live inside the <tbody> tag, so iterate over its children
    for tr in soup.find('tbody').children:
        # Keep only real <tr> tags (skip the plain-text whitespace nodes)
        if isinstance(tr, bs4.element.Tag):
            # Collect the <td> tags inside this row
            tds = tr('td')
            # [<td>1</td>, <td><div align="left">清华大学</div></td>, <td>北京</td>, <td>95.3</td>...
            # Keep the text of the first four cells (rank, name, region, score) and append them as a list
            ulist.append([tds[0].string, tds[1].string,
                          tds[2].string, tds[3].string])


# Print the university list, filtered to one region
def printUnivList(ulist, province):
    # Print the title, centred and padded with dashes
    print("中国最好大学排名2018({}地区)".format(province).center(45, '-'))
    # A format template for one row.
    # The {4} in the name column sets the fill character: English and Chinese characters have
    # different display widths and Python pads with ASCII spaces by default, which misaligns
    # Chinese text, so chr(12288) (the full-width space) is passed in as the fill instead.
    tplt = "{0:^10}\t{1:{4}^10}\t{2:^10}\t{3:^10}"
    # Print the header row
    print(tplt.format("排名", "学校名称", "地区", "总分", chr(12288)))
    if province == '安徽':
        # Hard-coded extra row printed at the top of the Anhui listing
        print(tplt.format(1, '安徽师范大学花津校区', '安徽', 99.9, chr(12288)))
    # Walk the whole list; each item is one university's info as a list
    # (shorten range(len(ulist)) if you only want the first N schools)
    for i in range(len(ulist)):
        u = ulist[i]
        # u[2] is the region; only rows matching the requested province are printed
        # (drop this check to print every school, or change the condition)
        if u[2] == province:
            print(tplt.format(u[0], u[1], u[2], u[3], chr(12288)))


# Entry point
def main(province='安徽'):
    # Empty list that fillUnivList will populate
    uinfo = []
    # The page to scrape
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2018.html'
    # Download the page text
    html = getHTMLText(url)
    # Parse the page and fill the university list
    fillUnivList(uinfo, html)
    # Print the filtered list
    printUnivList(uinfo, province=province)


main()
# main(province='北京')
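A quick aside on the chr(12288) trick above: the name column is padded with the full-width space (U+3000) because padding Chinese text with ordinary ASCII spaces makes the columns drift apart. A minimal stand-alone demo of the difference (the sample rows here are made up purely for illustration, not scraped data):

# Illustrative sample rows only
rows = [("1", "清华大学", "北京", "95.3"), ("2", "北京大学", "北京", "78.6")]

# Same template as above: {4} is the fill character for the name column
tplt = "{0:^10}\t{1:{4}^10}\t{2:^10}\t{3:^10}"

for r in rows:
    # Padding the name column with ASCII spaces: the Chinese column tends to misalign in a terminal
    print(tplt.format(r[0], r[1], r[2], r[3], ' '))
for r in rows:
    # Padding with the full-width space keeps the column widths visually consistent
    print(tplt.format(r[0], r[1], r[2], r[3], chr(12288)))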

1. Scraping the Douban Top 250

import requests
import time
from openpyxl import Workbook
from bs4 import BeautifulSoup

wb = Workbook()
sheet = wb.active
sheet.title = '好评电影'
# Write the header row once, before crawling
sheet['A1'].value = '序号'
sheet['B1'].value = '电影名称'
sheet['C1'].value = '电影评分'
sheet['D1'].value = '电影链接'
sheet['E1'].value = '电影图片'
count = 1
# The Top 250 list is paginated 25 movies per page via the start parameter.
# Only the first 100 movies (4 pages) are fetched here; use range(0, 250, 25) for the full list.
for i in range(0, 100, 25):
    ret = requests.get('https://movie.douban.com/top250?start=%s&filter=' % (i))
    bs = BeautifulSoup(ret.text, 'html.parser')
    ol = bs.find(name='ol', attrs={'class': 'grid_view'})
    li_list = ol.find_all(name='li')
    for li in li_list:
        name = li.find(name='span', attrs={'class': 'title'})       # movie title
        a = li.find(name='a')                                        # link to the detail page
        span = li.find(name='span', attrs={'class': 'rating_num'})   # rating
        img = a.find(name='img')                                     # poster image
        count += 1
        sheet['A%s' % (count)].value = count - 1
        sheet['B%s' % (count)].value = name.text
        sheet['C%s' % (count)].value = span.text
        sheet['D%s' % (count)].value = a['href']
        sheet['E%s' % (count)].value = img['src']
    time.sleep(1)  # be polite: pause between pages
wb.save('好评电影.xlsx')
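One hedged note: Douban may reject requests that use the requests library's default User-Agent, in which case the ol lookup above comes back as None. If that happens, sending a browser User-Agent (the same one the Autohome example below uses) normally helps; a sketch of the changed call inside the loop:

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }
    ret = requests.get('https://movie.douban.com/top250?start=%s&filter=' % (i), headers=headers)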

2. Scraping Autohome

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook


def run(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }
    response = requests.get(url,headers=headers)
    response.encoding = 'gbk'  # decode the page as GBK
    soup = BeautifulSoup(response.text,'html.parser')
    # Grab the <ul> that holds the article list
    ul = soup.find(name='ul', attrs={"class": "article"})
    # Grab every <li> inside it
    li_list = ul.find_all(name='li')
    infos = []
    for li in li_list:
        # Fall back to an empty string whenever a tag is missing (some <li> items are placeholders)
        name = li.find(name="h3")
        name1 = ""
        if name:
            name1 = name.text
        href = li.find(name='a')
        href1 = ""
        if href:
            href1 = 'http:' + href['href']
        info = li.find(name='p')
        info1 = ""
        if info:
            info1 = info.text
        infos.append({"title": name1, "href": href1, "info": info1})
    print(infos)

if __name__ == '__main__':
    url = 'https://www.autohome.com.cn/news/'
    run(url)

3. Scraping Doutula meme images

import os

import requests
from bs4 import BeautifulSoup

ret = requests.get('https://www.doutula.com/photo/list?page=0')
bs = BeautifulSoup(ret.text, 'html.parser')
div = bs.find(name='div', attrs={'class': 'page-content text-center'})

# Make sure the output directory exists before saving images into it
os.makedirs('表情包', exist_ok=True)

a_list = div.find_all(name='a')
for a in a_list:
    img = a.find(name='img')
    if not img:
        continue
    img_name = img.get('alt')         # image title, used as the file name
    img_url = img.get('data-backup')  # actual image URL

    if img_name and img_url:
        # Download the image and write it to disk
        ret_img = requests.get(img_url)
        with open('表情包/%s.jpg' % (img_name), 'wb') as f:
            f.write(ret_img.content)

4. Scraping Pear Video (pearvideo.com)

import requests
import re
from bs4 import BeautifulSoup

ret = requests.get('https://www.pearvideo.com/')
# print(ret.text)  # uncomment to inspect the raw homepage while debugging

bs = BeautifulSoup(ret.text, 'html.parser')
div_list = bs.find_all(name='div', attrs={'class': 'vervideo-tbd'})

num = 0
for div in div_list:
    a = div.find(name='a')
    video_url = 'https://www.pearvideo.com/' + a.get('href')
    video_ret = requests.get(video_url)

    # Pull the https://...mp4 URL out of the detail page; skip this video if none is found
    match = re.search(r'(https:\/\/)[^\s]+mp4', video_ret.text)
    if not match:
        continue
    mp4_url = match.group()
    print(mp4_url)
    mp4_ret = requests.get(mp4_url)
    with open('梨视频%s.mp4' % (num), 'wb') as f:
        f.write(mp4_ret.content)
    num += 1

Implementing an online translator

import requests
import json
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}


def main(keys=''):
    url = 'http://fy.iciba.com/ajax.php?a=fy'
    data = {
        'f': 'auto',
        't': 'auto',
        'w': keys
    }
    response = requests.post(url,headers=headers,data=data)
    info = response.text
    data_list = json.loads(info)
    try:
        val = data_list['content']['word_mean']  # Chinese -> English
    except KeyError:
        val = data_list['content']['out']  # English -> Chinese
    return val

if __name__ == '__main__':
    keys = input('请输入需要翻译的英文或者中文...')
    if not keys:
        print('请您正确输入需要翻译的中文或者英文...')
    else:
        data = main(keys)
        print(data)

Small selenium projects

Appetizer

# Search Baidu for "老男孩"
from selenium import webdriver
# Launch a Chrome browser
b = webdriver.Chrome()
# Open Baidu
b.get('https://www.baidu.com')
# Locate Baidu's search box by its id: kw
ele = b.find_element_by_id('kw')
# Clear anything already typed in the box
ele.clear()
# Type the search term
ele.send_keys('老男孩')
# Locate the search button (id: su)
su = b.find_element_by_id('su')
# Click it
su.click()
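The script above leaves the Chrome window open after clicking. Purely as a small optional sketch, you can pause long enough to see the result page and then shut the browser down (note that Selenium 4 renamed find_element_by_id to find_element(By.ID, ...); the code here uses the older 3.x-style helpers):

import time

# Give the results page a moment to render, then close the browser
time.sleep(3)
b.quit()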

Scraping JD.com

from selenium import webdriver
from selenium.webdriver.common.keys import Keys  # keyboard key constants (used to press Enter)
import time

def get_goods(driver):
    try:
        goods = driver.find_elements_by_class_name('gl-item')  # one element per product card

        for good in goods:
            detail_url = good.find_element_by_tag_name('a').get_attribute('href')

            p_name = good.find_element_by_css_selector('.p-name em').text.replace('\n','')
            price = good.find_element_by_css_selector('.p-price i').text
            p_commit = good.find_element_by_css_selector('.p-commit a').text
            msg = '''
            商品 : %s
            链接 : %s
            价钱 :%s
            评论 :%s
            ''' % (p_name, detail_url, price, p_commit)

            print(msg, end='\n\n')

        # Click "下一页" (next page) and recurse until the link can no longer be found
        button = driver.find_element_by_partial_link_text('下一页')
        button.click()
        time.sleep(1)
        get_goods(driver)
    except Exception:
        # No next-page link (or the page layout changed): stop crawling
        pass


def spider(url, keyword):
    driver = webdriver.Chrome()
    driver.get(url)
    driver.implicitly_wait(3)  # implicit wait: retry element lookups for up to 3 seconds
    try:
        input_tag = driver.find_element_by_id('key')
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        driver.close()


if __name__ == '__main__':
    spider('https://www.jd.com/', keyword='华为P30')
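The implicitly_wait(3) call above makes every element lookup retry for up to three seconds. As an alternative sketch (assuming the same 'gl-item' product cards), an explicit wait blocks only where it is needed, until the product list has actually rendered:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def wait_for_goods(driver, timeout=10):
    # Block until at least one product card exists, then return all of them
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'gl-item'))
    )
    return driver.find_elements(By.CLASS_NAME, 'gl-item')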

Scraping and data analysis: a "雨女无瓜" Bilibili danmaku word cloud

import requests
from bs4 import BeautifulSoup
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import re
import jieba
from wordcloud import WordCloud, ImageColorGenerator

url = "https://comment.bilibili.com/92542241.xml"
r = requests.get(url)
r.encoding = 'utf8'


soup = BeautifulSoup(r.text, 'lxml')
d = soup.find_all('d')  # each <d> tag holds one danmaku

# Collect every danmaku into a list of dicts
dlst = []
for i in d:
    danmuku = {}
    danmuku['弹幕'] = i.text
    danmuku['网址'] = url
    danmuku['时间'] = datetime.date.today()
    dlst.append(danmuku)

df = pd.DataFrame(dlst)

# Keep only the Chinese characters of every danmaku and dump them all into one text file
pattern = re.compile(r'[\u4e00-\u9fa5]+')
with open('sign.txt', 'w', encoding='utf8') as f:
    for text in df['弹幕'].values:
        filter_data = re.findall(pattern, text)
        f.write("".join(filter_data))

# Cut the text into words with jieba and count how often each word appears
with open('sign.txt', 'r', encoding='utf8') as f:
    data = f.read()
    segment = jieba.lcut(data)
    words_df = pd.DataFrame({"segment": segment})

words_stat = words_df.groupby('segment').size().reset_index(name='计数')
words_stat = words_stat.sort_values(by='计数', ascending=False)

color_mask = plt.imread('01.jpg')  # mask image that gives the word cloud its shape (any local image)

wordcloud = WordCloud(
    # font_path="simhei.ttf",  # not available on macOS
    font_path=r"C:\Windows\Fonts\simkai.ttf",  # a font that can render Chinese
    background_color="white",  # background colour
    max_words=3000,            # maximum number of words shown
    mask=color_mask,           # shape/mask image
    max_font_size=200,         # largest font size
    random_state=100,
    width=1000, height=860, margin=2,
    # width/height set the default canvas size, but when a mask image is used the output
    # follows the mask's size instead; margin is the spacing around each word
)

# Build the cloud. WordCloud.generate() takes raw text; the word frequencies are already
# counted here, so generate_from_frequencies() is used with the 500 most common words.
word_frequence = {x[0]: x[1] for x in words_stat.head(500).values}
wordcloud.generate_from_frequencies(word_frequence)
# Derive colours from the mask image
# image_colors = ImageColorGenerator(color_mask)
# Recolour the cloud with those colours
# wordcloud.recolor(color_func=image_colors)
# Save the image
wordcloud.to_file('output.png')
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
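As the comment before generate_from_frequencies() notes, WordCloud can also be fed raw text via generate(). A minimal sketch reusing the sign.txt file written above (the output file name is only for illustration); because WordCloud splits on whitespace, the jieba tokens are joined with spaces first:

with open('sign.txt', 'r', encoding='utf8') as f:
    raw_text = f.read()

# Insert spaces between words so WordCloud's whitespace tokenizer can split the Chinese text
spaced_text = " ".join(jieba.lcut(raw_text))
wordcloud.generate(spaced_text)
wordcloud.to_file('output_from_text.png')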

 
