Data Collection and Fusion Technology: Lab 5
Assignment ①
I. Lab Content
- Requirements:
  - Master using Selenium to locate HTML elements, scrape Ajax-rendered page data, and wait for HTML elements to load.
  - Use the Selenium framework to scrape product information and images for a chosen category on JD.com.
- Candidate site: http://www.jd.com/
- Keyword: free choice
- Output: the MySQL output is shown below

| mNo | mMark | mPrice | mNote | mFile |
| --- | --- | --- | --- | --- |
| 000001 | 三星Galaxy | 9199.00 | 三星Galaxy Note20 Ultra 5G... | 000001.jpg |
| 000002 | ...... | | | |
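The code in the next section deletes from a `phones` table but never creates it, so the table is assumed to already exist in the `datamining` database. A minimal sketch to create it (the column sizes are my assumption, not part of the assignment):

```python
import pymysql

con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                      passwd="cyz20010726", db="datamining", charset="utf8")
cursor = con.cursor()
# Assumed schema matching the five output columns above
cursor.execute(
    "create table if not exists phones ("
    "mNo varchar(32) primary key, mMark varchar(256), mPrice varchar(32), "
    "mNote varchar(1024), mFile varchar(256))"
)
con.commit()
con.close()
```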
II. Code Implementation
1. Import the required packages
```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import urllib.request
import threading
import pymysql
import os
import datetime
import time
```
2. Create the MySpider class
```python
class MySpider:
    # Request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    }
    imagePath = "download"  # image download directory
    page = 1                # number of pages crawled so far

    def startUp(self, url, key):
        # Initialize a headless Chrome browser
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)
        # Initialize variables
        self.threads = []
        self.No = 0
        # Initialize the database connection
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="cyz20010726",
                                       db="datamining", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            try:
                # Clear any records left over from a previous run
                self.cursor.execute("delete from phones")
            except:
                pass
        except Exception as err:
            print(err)
        # Initialize the images folder
        try:
            if not os.path.exists(MySpider.imagePath):
                os.mkdir(MySpider.imagePath)
            for img in os.listdir(MySpider.imagePath):
                os.remove(os.path.join(MySpider.imagePath, img))
        except Exception as err:
            print(err)
        # Open the home page and submit the search keyword
        self.driver.get(url)
        keyInput = self.driver.find_element_by_id("key")
        keyInput.send_keys(key)
        keyInput.send_keys(Keys.ENTER)

    def closeUp(self):
        try:
            self.con.commit()
            self.con.close()
            self.driver.close()
        except Exception as err:
            print(err)

    def download(self, src1, src2, mFile):
        # Try src1 first and fall back to src2
        data = None
        if src1:
            try:
                req = urllib.request.Request(src1, headers=MySpider.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
            except:
                pass
        if not data and src2:
            try:
                req = urllib.request.Request(src2, headers=MySpider.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
            except:
                pass
        if data:
            print("download begin", mFile)
            with open(os.path.join(MySpider.imagePath, mFile), "wb") as fobj:
                fobj.write(data)
            print("download finish", mFile)

    def processSpider(self):
        try:
            time.sleep(1)
            print(self.driver.current_url)
            lis = self.driver.find_elements_by_xpath("//div[@id='J_goodsList']//li[@class='gl-item']")
            for li in lis:
                # The image URL lives either in src or in the data-lazy-img attribute
                try:
                    src1 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("src")
                except:
                    src1 = ""
                try:
                    src2 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("data-lazy-img")
                except:
                    src2 = ""
                try:
                    price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
                except:
                    price = "0"
                try:
                    note = li.find_element_by_xpath(".//div[@class='p-name p-name-type-2']//em").text
                    mark = note.split(" ")[0]
                    mark = mark.replace("爱心东东\n", "")
                    mark = mark.replace(",", "")
                    note = note.replace("爱心东东\n", "")
                    note = note.replace(",", "")
                except:
                    note = ""
                    mark = ""
                self.No = self.No + 1
                no = str(self.No)
                while len(no) < 6:  # zero-pad the serial number to six digits
                    no = "0" + no
                print(no, mark, price)
                if src1:
                    src1 = urllib.request.urljoin(self.driver.current_url, src1)
                    p = src1.rfind(".")
                    mFile = no + src1[p:]
                elif src2:
                    src2 = urllib.request.urljoin(self.driver.current_url, src2)
                    p = src2.rfind(".")
                    mFile = no + src2[p:]
                if src1 or src2:
                    # Download each image in its own thread
                    T = threading.Thread(target=self.download, args=(src1, src2, mFile))
                    T.daemon = False
                    T.start()
                    self.threads.append(T)
                else:
                    mFile = ""
                self.cursor.execute("insert into phones (mNo, mMark, mPrice, mNote, mFile) values (%s, %s, %s, %s, %s)",
                                    (no, mark, price, note, mFile))
            # Move on to the next page until the last page is reached
            try:
                # A disabled "next page" button means we are on the last page
                self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next disabled']")
            except:
                if MySpider.page < 6:  # crawl at most 6 pages
                    MySpider.page += 1
                    nextPage = self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next']")
                    time.sleep(10)
                    nextPage.click()
                    self.processSpider()
        except Exception as err:
            print(err)

    def executeSpider(self, url, key):
        starttime = datetime.datetime.now()
        print("Spider starting......")
        self.startUp(url, key)
        print("Spider processing......")
        self.processSpider()
        print("Spider closing......")
        self.closeUp()
        for t in self.threads:
            t.join()
        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = (endtime - starttime).seconds
        print("Total ", elapsed, " seconds elapsed")
```
3. Run the program
```python
url = "http://www.jd.com"
spider = MySpider()
spider.executeSpider(url, "手机")  # search keyword: 手机 (mobile phones)
```
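The script above waits with fixed `time.sleep` pauses. Selenium's explicit waits are a more reliable way to meet the "wait for HTML elements" requirement; below is a minimal standalone sketch (the search URL is my assumption; the XPath is the one used in `processSpider`):

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://search.jd.com/Search?keyword=手机")
# Block until the product list is rendered (raises TimeoutException after 10 s)
lis = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located(
        (By.XPATH, "//div[@id='J_goodsList']//li[@class='gl-item']")))
print(len(lis), "items loaded")
driver.quit()
```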
4. Results
III. Reflections
This task was mainly about reproducing the code from the textbook, and it gave me a deeper understanding of the Selenium framework.
Appendix: link to the full code
Assignment ②
I. Lab Content
- Requirements:
  - Master using Selenium to locate HTML elements, simulate a user login, scrape Ajax-rendered page data, and wait for HTML elements to load.
  - Use the Selenium framework + MySQL to simulate logging in to the MOOC site, fetch the information of the courses in your own account (course number, course name, school, progress, course status, course image URL), save it to MySQL, and also store the images in the imgs folder under the project root, naming each image after its course.
- Candidate site: China MOOC: https://www.icourse163.org
- Output: MySQL storage and output format.
  Column headers should be named in English, e.g. course number: Id, course name: cCourse, ...; students define and design the headers themselves:

| Id | cCourse | cCollege | cSchedule | cCourseStatus | cImgUrl |
| --- | --- | --- | --- | --- | --- |
| 1 | Python网络爬虫与信息提取 | 北京理工大学 | 已学3/18课时 | 2021年5月18日已结束 | http://edu-image.nosdn.127.net/C0AB6FA791150F0DFC0946B9A01C8CB2.jpg |
| 2 | ...... | | | | |
II. Code Implementation
1. Import the required packages
```python
import os
import threading
import datetime
import time
import urllib.request
import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
```
2. Create the MySpider class
```python
class MySpider:
    # Request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    }
    flag = 1  # 1 while crawling MOOC courses, 0 once we switch to SPOC courses
    imagePath = "download_2"  # image download directory

    def startUp(self, url):
        # Initialize a headless Chrome browser
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)
        # Initialize variables
        self.No = 0
        self.threads = []
        # Initialize the database connection
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="cyz20010726",
                                       db="datamining", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            try:
                # Clear any records left over from a previous run
                self.cursor.execute("delete from courses")
            except:
                pass
            try:
                # Create the table if it does not exist yet
                sql = "create table courses (cId int primary key, cCourse varchar(256), cCollege varchar(256), cSchedule varchar(256), cCourseStatus varchar(256), cImgUrl varchar(256))"
                self.cursor.execute(sql)
            except:
                pass
        except Exception as err:
            print(err)
        # Initialize the images folder
        try:
            if not os.path.exists(MySpider.imagePath):
                os.mkdir(MySpider.imagePath)
            for img in os.listdir(MySpider.imagePath):
                os.remove(os.path.join(MySpider.imagePath, img))
        except Exception as err:
            print(err)
        # Simulate the login flow
        self.driver.get(url)
        self.driver.find_element_by_xpath("//div[@class='_1Y4Ni']/div[@class='_3uWA6']").click()  # the login/register button
        time.sleep(2)
        self.driver.find_element_by_xpath("//div[@class='ux-login-set-scan-code_ft']/span").click()  # "other login methods"
        time.sleep(2)
        self.driver.find_element_by_xpath("//ul[@class='ux-tabs-underline_hd']//li[2]").click()  # the phone-number login tab
        time.sleep(2)
        # The login form sits inside an iframe, so switch into it first
        self.driver.switch_to.frame(self.driver.find_elements_by_tag_name("iframe")[1].get_attribute('id'))
        input_tel = self.driver.find_element_by_xpath("//form[@id='login-form']//div[@class='inputbox']//div[@class='u-input box']//input[@id='phoneipt']")  # phone-number input box
        input_password = self.driver.find_element_by_xpath("//form[@id='login-form']//div[@class='u-input box']//input[@class='j-inputtext dlemail']")  # password input box
        input_tel.send_keys("xxxxxxxxx")  # enter the phone number
        time.sleep(2)
        input_password.send_keys("xxxxxxxxxx")  # enter the password
        time.sleep(2)
        self.driver.find_element_by_xpath("//a[@id='submitBtn']").click()  # the login button
        time.sleep(2)
        # Open the personal center page
        myCourse = self.driver.find_element_by_xpath("//div[@id='j-indexNav-bar']//div[@class='u-navLogin-myCourse-t']//a").get_attribute("href")
        self.driver.get(myCourse)

    def download(self, src, mFile):
        data = None
        if src:
            try:
                req = urllib.request.Request(src, headers=MySpider.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
            except:
                pass
        if data:
            print("download begin", mFile)
            with open(os.path.join(MySpider.imagePath, mFile), "wb") as fobj:
                fobj.write(data)
            print("download finish", mFile)

    def closeUp(self):
        try:
            self.con.commit()
            self.con.close()
            self.driver.close()
        except Exception as err:
            print(err)

    def processSpider(self):
        try:
            time.sleep(1)
            # Each course card on the course list page
            divs = self.driver.find_elements_by_xpath("//div[@class='course-panel-body-wrapper']/div[@class='course-card-wrapper']")
            for div in divs:
                self.No += 1
                try:
                    name = div.find_element_by_xpath(".//div[@class='body']//span[@class='text']").text
                except Exception:
                    name = ""
                try:
                    college = div.find_element_by_xpath(".//div[@class='school']//a").text
                except Exception:
                    college = ""
                try:
                    schedule = div.find_element_by_xpath(".//div[@class='personal-info']//span[@class='course-progress-text-span']").text
                except Exception:
                    schedule = ""
                try:
                    status = div.find_element_by_xpath(".//div[@class='course-status']").text
                except:
                    status = ""
                try:
                    img = div.find_element_by_xpath(".//div[@class='img']/img").get_attribute("src")
                except:
                    img = ""
                if img:
                    # Download the image in its own thread, named by serial number
                    mFile = str(self.No) + '.jpg'
                    src = urllib.request.urljoin(self.driver.current_url, img)
                    T = threading.Thread(target=self.download, args=(src, mFile))
                    T.daemon = False
                    T.start()
                    self.threads.append(T)
                else:
                    mFile = ""
                # Store the image URL in cImgUrl (the local copy is saved as mFile)
                self.cursor.execute(
                    "insert into courses (cId, cCourse, cCollege, cSchedule, cCourseStatus, cImgUrl) values (%s, %s, %s, %s, %s, %s)",
                    (self.No, name, college, schedule, status, img))
            if MySpider.flag:  # the MOOC courses are done, move on to the SPOC courses
                MySpider.flag = 0
                spot_url = self.driver.find_element_by_xpath("//div[@id='j-module-tab']//div[@class='item u-st-spoc-course ga-click']//a")  # the SPOC course tab
                spot_url.click()
                self.processSpider()
        except Exception as err:
            print(err)

    def executeSpider(self, url):
        starttime = datetime.datetime.now()
        print("Spider starting......")
        self.startUp(url)
        print("Spider processing......")
        self.processSpider()
        print("Spider closing......")
        self.closeUp()
        for t in self.threads:
            t.join()
        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = (endtime - starttime).seconds
        print("Total ", elapsed, " seconds elapsed")
```
3. Run the program
```python
url = "https://www.icourse163.org/"
spider = MySpider()
spider.executeSpider(url)
```
4. Results
III. Reflections
This task again uses the Selenium framework to scrape data; the difference is that it requires a simulated login. At first the MOOC site was under maintenance, so I could not log in at all. When the server was fixed and I tried again, I could not locate the phone-number and password input boxes. The references I found said to use self.driver.switch_to.frame(self.driver.find_elements_by_tag_name("iframe")[1].get_attribute('id')), though I never fully worked out why. Even with that I still could not log in; after being stuck for a long time I realized I had been locating the email-login tab instead of the phone-number-login tab. After these twists and turns, I finally scraped all the required information.
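For reference, the iframe issue boils down to entering the login frame before locating the inputs and leaving it afterwards. A minimal sketch (note that `switch_to.frame` also accepts the iframe WebElement directly, so reading its id is not strictly necessary):

```python
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://www.icourse163.org/")
# ... click through to the phone-number login tab as in startUp ...
login_frame = driver.find_elements_by_tag_name("iframe")[1]
driver.switch_to.frame(login_frame)  # the login form lives inside this iframe
# ... locate the phone and password inputs and submit the form ...
driver.switch_to.default_content()   # switch back to the top-level document
```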
Appendix: link to the full code
Assignment ③: Flume Log Collection Lab
Requirements: master the relevant big-data services and become familiar with using Xshell.
I. Environment Setup
Task 1: Enable the MapReduce Service (MRS)
II. Hands-On Real-Time Analytics Development
Task 1: Generate test data with a Python script
- Write the Python script (see the sketch after this list)
- Create the directory
- Test-run the script
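The lab's own generator script is not reproduced here; as a placeholder, a minimal sketch of a script that appends one timestamped random record per second to a log file (the file path and record format are my assumptions):

```python
import random
import time

# Assumed path watched by Flume; create the directory first (see the step above)
LOG_FILE = "/tmp/flume_test/test.log"

while True:  # runs until interrupted
    ts = time.strftime("%Y-%m-%d %H:%M:%S")
    record = "{},user_{},{}\n".format(
        ts, random.randint(1, 100), random.choice(["login", "view", "buy"]))
    with open(LOG_FILE, "a") as f:
        f.write(record)
    time.sleep(1)
```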
Task 2: Configure Kafka
- Set the environment variables
- Create a topic in Kafka (see the sketch after this list)
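The lab creates the topic with Kafka's shell tools; for completeness, the same step sketched in Python, assuming the `kafka-python` package and placeholder broker address and topic name:

```python
from kafka.admin import KafkaAdminClient, NewTopic

# Broker address and topic name are placeholders for the MRS cluster's values
admin = KafkaAdminClient(bootstrap_servers="192.168.0.1:9092")
admin.create_topics([NewTopic(name="test_topic", num_partitions=1, replication_factor=1)])
admin.close()
```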
Task 3: Install the Flume client
- Open the Flume service page
- Click to download the client
- Unpack the downloaded Flume client archive
- Verify the package checksum
- Unpack the "MRS_Flume_ClientConfig.tar" file
- Install the Flume environment variables
- Unpack the Flume client
- Install the Flume client
- Restart the Flume service
Task 4: Configure Flume to collect data
- Modify the configuration file
- Create a consumer to read the data from Kafka (see the sketch after this list)
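The lab verifies the pipeline with Kafka's console consumer; an equivalent sketch in Python, again assuming `kafka-python` and the placeholder broker address and topic name from above:

```python
from kafka import KafkaConsumer

# Print every record that Flume has pushed into the topic
consumer = KafkaConsumer("test_topic",
                         bootstrap_servers="192.168.0.1:9092",
                         auto_offset_reset="earliest")
for msg in consumer:
    print(msg.value.decode("utf-8"))
```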
III. Reflections
This was my first time using Huawei Cloud, so it felt a little unfamiliar, but many of the commands are the same as on Linux, so it was not hard to get used to. The lab also lays the groundwork for the experiments that follow.