
Data Collection and Fusion Technology: Lab 5

Assignment ①

I. Lab Content

  • Requirements:

    • Become proficient with Selenium: locating HTML elements, scraping Ajax-loaded page data, waiting for HTML elements, and so on.
    • Use the Selenium framework to scrape the information and images of one category of products on JD (京东商城).
  • Candidate site: http://www.jd.com/

  • Keyword: your choice

  • Output: the MySQL output looks like this

    mNo     mMark       mPrice   mNote                           mFile
    000001  三星Galaxy   9199.00  三星Galaxy Note20 Ultra 5G...    000001.jpg
    000002  ......
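
Note that, unlike Assignment ②, the code below only clears the phones table and never creates it, so the table must already exist in the datamining database. A minimal sketch of a DDL matching the columns above (the column types and widths are my own assumptions, not from the original post):

import pymysql

con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                      passwd="cyz20010726", db="datamining", charset="utf8")
cursor = con.cursor()
# mNo is zero-padded to six digits by the spider, hence varchar rather than int
cursor.execute("create table if not exists phones ("
               "mNo varchar(32) primary key, mMark varchar(256), "
               "mPrice varchar(32), mNote varchar(1024), mFile varchar(256))")
con.commit()
con.close()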

II. Code Implementation

1. Import the required packages

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import urllib.request
import threading
import pymysql
import os
import datetime
from selenium.webdriver.common.keys import Keys
import time

2. Create the MySpider class

class MySpider:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    }  # request headers

    imagePath = "download"  # image download directory
    page = 1  # current page number

    def startUp(self, url, key):
        # Initializing Chrome browser (headless)
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=chrome_options)

        # Initializing variables
        self.threads = []
        self.No = 0

        # Initializing database
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="cyz20010726", db="datamining", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            try:
                # clear any existing rows (the phones table must already exist)
                self.cursor.execute("delete from phones")
            except:
                pass

        except Exception as err:
            print(err)
        # Initializing images folder
        try:
            if not os.path.exists(MySpider.imagePath):
                os.mkdir(MySpider.imagePath)
            images = os.listdir(MySpider.imagePath)
            for img in images:
                s = os.path.join(MySpider.imagePath, img)
                os.remove(s)
        except Exception as err:
            print(err)

        self.driver.get(url)
        keyInput = self.driver.find_element_by_id("key")
        keyInput.send_keys(key)
        keyInput.send_keys(Keys.ENTER)

    def closeUp(self):
        try:
            self.con.commit()
            self.con.close()
            self.driver.close()

        except Exception as err:
            print(err)

    def download(self, src1, src2, mFile):
        data = None
        if src1:
            try:
                req = urllib.request.Request(src1, headers=MySpider.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
            except:
                pass
        if not data and src2:
            try:
                req = urllib.request.Request(src2, headers=MySpider.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
            except:
                pass
        if data:
            print("download begin", mFile)
            fobj = open(os.path.join(MySpider.imagePath, mFile), "wb")
            fobj.write(data)
            fobj.close()
            print("download finish", mFile)

    def processSpider(self):
        try:
            time.sleep(1)
            print(self.driver.current_url)
            lis = self.driver.find_elements_by_xpath("//div[@id='J_goodsList']//li[@class='gl-item']")
            for li in lis:
                # We find that the image is either in src or in data-lazy-img attribute
                try:
                    src1 = li.find_element_by_xpath(".//div[@class='pimg']//a//img").get_attribute("src")
                except:
                    src1 = ""

                try:
                    src2 = li.find_element_by_xpath(".//div[@class='pimg']//a//img").get_attribute("data-lazy-img")
                except:
                    src2 = ""

                try:
                    price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
                except:
                    price = "0"

                try:
                    note = li.find_element_by_xpath(".//div[@class='p-name p-name-type-2']//em").text
                    mark = note.split(" ")[0]
                    mark = mark.replace("爱心东东\n", "")
                    mark = mark.replace(",", "")
                    note = note.replace("爱心东东\n", "")
                    note = note.replace(",", "")

                except:
                    note = ""
                    mark = ""

                self.No = self.No + 1
                no = str(self.No)
                while len(no) < 6:
                    no = "0" + no
                print(no, mark, price)
                if src1:
                    src1 = urllib.request.urljoin(self.driver.current_url, src1)
                    p = src1.rfind(".")
                    mFile = no + src1[p:]
                elif src2:
                    src2 = urllib.request.urljoin(self.driver.current_url, src2)
                    p = src2.rfind(".")
                    mFile = no + src2[p:]
                if src1 or src2:
                    T = threading.Thread(target=self.download, args=(src1, src2, mFile))
                    T.daemon = False
                    T.start()
                    self.threads.append(T)
                else:
                    mFile = ""
                self.cursor.execute("insert into phones (mNo, mMark, mPrice, mNote, mFile) values (%s, %s, %s, %s, %s)",
                                    (no, mark, price, note, mFile))
            # move to the next page until the last page is reached
            try:
                self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next disabled']")
            except:
                if MySpider.page < 6:  # crawl 6 pages in total
                    MySpider.page += 1
                    nextPage = self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next']")
                    time.sleep(10)
                    nextPage.click()
                    self.processSpider()
        except Exception as err:
            print(err)

    def executeSpider(self, url, key):
        starttime = datetime.datetime.now()
        print("Spider starting......")
        self.startUp(url, key)
        print("Spider processing......")
        self.processSpider()
        print("Spider closing......")
        self.closeUp()
        for t in self.threads:
            t.join()
        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = (endtime - starttime).seconds
        print("Total ", elapsed, " seconds elapsed")

3. Run the program

url = "http://www.jd.com"
spider = MySpider()
spider.executeSpider(url, "手机")

4. Results


III. Reflections

This task was mainly about reproducing the code from the textbook, and it gave me a deeper understanding of the Selenium framework.

Appendix: link to the full code

Assignment ②

I. Lab Content

  • Requirements:

    • Become proficient with Selenium: locating HTML elements, simulating user login, scraping Ajax-loaded page data, waiting for HTML elements, and so on.
    • Use Selenium plus MySQL to simulate logging in to the icourse163 MOOC site, fetch the information of the courses already in your own account (course id, course name, school, progress, course status, and course image URL), save it to MySQL, and also save the images into an imgs folder under the project root, naming each image after its course.
  • Candidate site: icourse163 (Chinese MOOC): https://www.icourse163.org

  • Output: MySQL storage and output format

    Column names should be in English, e.g. course id: Id, course name: cCourse, and so on; the schema is designed by each student:

    Id cCourse cCollege cSchedule cCourseStatus cImgUrl
    1 Python网络爬虫与信息提取 北京理工大学 已学3/18课时 2021年5月18日已结束 http://edu-image.nosdn.127.net/C0AB6FA791150F0DFC0946B9A01C8CB2.jpg
    2......

II. Code Implementation

1. Import the required packages

import os
import threading

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pymysql
import datetime
import time
import urllib.request

2. Create the MySpider class

class MySpider:
    # request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    }
    flag = 1  # 1 while crawling MOOC courses, 0 once switched to SPOC courses
    imagePath = "download_2"  # image download directory

    def startUp(self, url):
        # Initializing Chrome browser (headless)
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=chrome_options)

        # Initializing variables
        self.No = 0
        self.threads = []
        # Initializing database
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="cyz20010726", db="datamining", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            try:
                # clear any existing rows if the table already exists
                self.cursor.execute("delete from courses")
            except:
                pass
            try:
                # create the table
                sql = "create table courses (cId int primary key, cCourse varchar(256), cCollege varchar(256), cSchedule varchar(256), cCourseStatus varchar(256), cImgUrl varchar(256))"
                self.cursor.execute(sql)
            except:
                pass
        except Exception as err:
            print(err)

        # Initializing images folder
        try:
            if not os.path.exists(MySpider.imagePath):
                os.mkdir(MySpider.imagePath)
            images = os.listdir(MySpider.imagePath)
            for img in images:
                s = os.path.join(MySpider.imagePath, img)
                os.remove(s)
        except Exception as err:
            print(err)

        self.driver.get(url)
        self.driver.find_element_by_xpath("//div[@class='_1Y4Ni']/div[@class='_3uWA6']").click()  # 找到登录/注册按钮
        time.sleep(2)
        
        self.driver.find_element_by_xpath("//div[@class='ux-login-set-scan-code_ft']/span").click()  # 其他方式登录
        time.sleep(2)
        
        self.driver.find_element_by_xpath("//ul[@class='ux-tabs-underline_hd']//li[2]").click()  # 手机号登录
        time.sleep(2)
        
        self.driver.switch_to.frame(self.driver.find_elements_by_tag_name("iframe")[1].get_attribute('id'))
        input_tel = self.driver.find_element_by_xpath("//form[@id='login-form']//div[@class='inputbox']//div[@class='u-input box']//input[@id='phoneipt']")  # 手机号输入位置
        input_password = self.driver.find_element_by_xpath("//form[@id='login-form']//div[@class='u-input box']//input[@class='j-inputtext dlemail']")  # 密码输入位置
        input_tel.send_keys("xxxxxxxxx")  # 输入手机号
        time.sleep(2)
        input_password.send_keys("xxxxxxxxxx")  # 输入密码
        time.sleep(2)
        
        self.driver.find_element_by_xpath("//a[@id='submitBtn']").click()  # 登录按钮
        time.sleep(2)
        
        myCourse = self.driver.find_element_by_xpath("//div[@id='j-indexNav-bar']//div[@class='u-navLogin-myCourse-t']//a").get_attribute("href")  # 个人中心
        self.driver.get(myCourse)

    def download(self, src, mFile):
        data = None
        if src:
            try:
                req = urllib.request.Request(src, headers=MySpider.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
            except:
                pass
        if data:
            print("download begin", mFile)
            fobj = open(os.path.join(MySpider.imagePath, mFile), "wb")
            fobj.write(data)
            fobj.close()
            print("download finish", mFile)

    def closeUp(self):
        try:
            self.con.commit()
            self.con.close()
            self.driver.close()

        except Exception as err:
            print(err)

    def processSpider(self):
        try:
            time.sleep(1)
            divs = self.driver.find_elements_by_xpath("//div[@class='course-panel-body-wrapper']/div[@class='course-card-wrapper']")  # course cards
            for div in divs:
                self.No += 1
                try:
                    name = div.find_element_by_xpath(".//div[@class='body']//span[@class='text']").text
                except Exception:
                    name = ""
                try:
                    college = div.find_element_by_xpath(".//div[@class='school']//a").text
                except Exception:
                    college = ""
                try:
                    schedule = div.find_element_by_xpath(".//div[@class='personal-info']//span[@class='course-progress-text-span']").text
                except Exception:
                    schedule = ""
                try:
                    status = div.find_element_by_xpath(".//div[@class='course-status']").text
                except:
                    status = ""
                try:
                    img = div.find_element_by_xpath(".//div[@class='img']/img").get_attribute("src")
                except:
                    img = ""
                if img:
                    # download the image, naming it by its sequence number
                    mFile = str(self.No) + '.jpg'
                    src = urllib.request.urljoin(self.driver.current_url, img)
                    T = threading.Thread(target=self.download, args=(src, mFile))
                    T.daemon = False
                    T.start()
                    self.threads.append(T)
                else:
                    mFile = ""
                self.cursor.execute(
                    "insert into courses (cId, cCourse, cCollege, cSchedule, cCourseStatus, cImgUrl) values (%s, %s, %s, %s, %s, %s)",
                    (self.No, name, college, schedule, status, img))  # store the image URL, matching the cImgUrl column
            if MySpider.flag:  # flag = 1 means the MOOC courses are done
                MySpider.flag = 0  # crawl the SPOC courses next
                spot_url = self.driver.find_element_by_xpath("//div[@id='j-module-tab']//div[@class='item u-st-spoc-course   ga-click']//a")  # SPOC course tab
                spot_url.click()
                self.processSpider()
        except Exception as err:
            print(err)

    def executeSpider(self, url):
        starttime = datetime.datetime.now()
        print("Spider starting......")
        self.startUp(url)
        print("Spider processing......")
        self.processSpider()
        print("Spider closing......")
        self.closeUp()
        for t in self.threads:
            t.join()  # wait for the image downloads to finish, as in Assignment ①
        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = (endtime - starttime).seconds
        print("Total ", elapsed, " seconds elapsed")

3. Run the program

url = "https://www.icourse163.org/"
spider = MySpider()
spider.executeSpider(url)

4. Results


III. Reflections

This task again used the Selenium framework to scrape data, the difference being the simulated login. At first the MOOC site was under maintenance and I could not log in at all. After the server was back up, I could not locate the phone-number and password input fields. Some references said to use self.driver.switch_to.frame(self.driver.find_elements_by_tag_name("iframe")[1].get_attribute('id')) because the login form sits inside an iframe, but even with that I still could not log in. After being stuck for a long time, I discovered that my "phone-number login" locator was actually hitting the email-login tab. After these twists and turns, I finally scraped the required information successfully.
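For reference, the iframe problem can also be handled with an explicit wait that switches into the frame only once it is available. This is a minimal sketch of the idea, not the code used above; the real login page contains several iframes, so the simple tag-name locator here is an assumption:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://www.icourse163.org/")
wait = WebDriverWait(driver, 10)
# wait for the login iframe and switch into it in a single step
wait.until(EC.frame_to_be_available_and_switch_to_it((By.TAG_NAME, "iframe")))
# the phone input only exists inside the frame, so wait for it there
phone = wait.until(EC.presence_of_element_located((By.ID, "phoneipt")))
driver.switch_to.default_content()  # switch back out when done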

Appendix: link to the full code

Assignment ③: Flume Log Collection Lab

Requirements: master the related big-data services and become familiar with Xshell.

I. Environment Setup

Task 1: Enable the MapReduce (MRS) service

II. Hands-On Real-Time Analysis Development

Task 1: Generate test data with a Python script

  1. Write the Python script (a minimal sketch is given after this list)

  2. Create the directory

  3. Run a test
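
The post shows only screenshots of this step; as an illustration, here is a minimal sketch of a generator that appends one fake record per line to a log file. The file path, batch size, and record fields are all my own assumptions, not the lab's actual script:

import random
import time
from datetime import datetime

def generate(n=10, path="/tmp/flume_test/test.log"):
    # append n fake records: timestamp, user id, score
    with open(path, "a") as f:
        for _ in range(n):
            f.write("{},{},{}\n".format(
                datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                random.randint(1, 100),
                random.randint(0, 60)))

if __name__ == "__main__":
    while True:
        generate()
        time.sleep(5)  # emit a new batch every 5 seconds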

Task 2: Configure Kafka

  1. Set the environment variables
  2. Create a topic in Kafka (an illustrative sketch follows this list)
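
On MRS the topic is normally created with Kafka's own command-line client; purely as an illustration, the same thing can be done from Python with the third-party kafka-python package. The broker address and topic name below are assumptions:

from kafka.admin import KafkaAdminClient, NewTopic

admin = KafkaAdminClient(bootstrap_servers="192.168.0.1:9092")  # assumed broker address
# one partition and one replica are enough for a test topic
admin.create_topics([NewTopic(name="flume_test", num_partitions=1, replication_factor=1)])
admin.close()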

Task 3: Install the Flume client

  1. Open the Flume service page
  2. Click to download the client
  3. Extract the downloaded Flume client archive
  4. Verify the package checksum
  5. Extract the "MRS_Flume_ClientConfig.tar" file
  6. Install the Flume environment variables
  7. Extract the Flume client package
  8. Install the Flume client
  9. Restart the Flume service

Task 4: Configure Flume to collect data

  1. Edit the Flume configuration file (a sketch is given after this list)
  2. Create a consumer to read the data from Kafka
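
The post does not reproduce the configuration file itself; a minimal sketch of an agent that tails the generated log and publishes each line to the Kafka topic might look like the following. Every path, address, and name here is an assumption:

# agent components
a1.sources = r1
a1.channels = c1
a1.sinks = k1

# source: follow the test log written by the Python script
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /tmp/flume_test/test.log
a1.sources.r1.channels = c1

# channel: buffer events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# sink: publish each event to the Kafka topic created in Task 2
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.bootstrap.servers = 192.168.0.1:9092
a1.sinks.k1.kafka.topic = flume_test
a1.sinks.k1.channel = c1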

III. Reflections

This was my first time using Huawei Cloud, so it felt a little unfamiliar at first, but many of the commands are the same as on Linux, so it did not take long to get comfortable; this lab also lays the groundwork for the experiments that follow.
