京东华为手机商品信息采集与数据分析

一、选题的背景

随着电商的迅速发展，越来越多的人选择网络购物。本文通过爬取京东华为手机的商品相关信息，并根据爬取到的数据做数据分析，得出有效的结论，为消费者和商家提出可行的建议。

二、主题式网络爬虫的设计方案

1.主题式网络爬虫名称

京东华为手机商品信息采集与数据分析

2.主题式网络爬虫爬取的内容与数据特征分析

目标网址https://www.jd.com/

主要爬取的内容包括商品的标题、价格、评论数量、店铺名称、商品唯一标识ID、店铺标签和详情页网址等信息。

3.主题式网络爬虫的设计方案

爬取方式选择selenium自动化采集的方法，使用到的Python包有time、selenium、lxml和openpyxl，技术难点在于数据的获取和解析部分。爬取过程设计如下图：

三、数据页面的结构特征分析

打开京东网站，在搜索框输入“华为手机”，然后右键检查，可以看到如下的网页结构：

可以发现，页面为标准的静态网页，可以选择xpath解析。由于网页结构层级过于复杂，这里不便也没有必要画出节点的结构图，因为可以在开发者工具中用鼠标点击目标元素，直接确定所爬取字段的xpath表达式。

四、网络爬虫程序设计

1.数据爬取与采集

数据采集代码如下：

  1 #  导入相关包
  2 import time
  3 from selenium import webdriver
  4 from selenium.webdriver.support import expected_conditions as EC
  5 from selenium.webdriver.common.by import By
  6 from selenium.webdriver.support.ui import WebDriverWait
  7 from lxml import etree
  8 from openpyxl import Workbook
  9 
 10 
 11 # 数据存储
 12 wb = Workbook()
 13 sheet = wb.active
 14 sheet['A1'] = 'name'
 15 sheet['B1'] = 'price'
 16 sheet['C1'] = 'commit'
 17 sheet['D1'] = 'shop'
 18 sheet['E1'] = 'sku'
 19 sheet['F1'] = 'icons'
 20 sheet['G1'] = 'detail_url'
 21 
 22 options = webdriver.ChromeOptions()
 23 # 不加载图片，提高数据爬取速度
 24 options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
 25 driver = webdriver.Chrome(options=options)
 26 wait = WebDriverWait(driver, 45)  # 设置等待时间为45秒
 27 
 28 
 29 def search(keyword):
 30     try:
 31         input = wait.until(
 32             EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#key"))
 33         )  # 等到搜索框加载出来
 34         submit = wait.until(
 35             EC.element_to_be_clickable((By.CSS_SELECTOR, "#search > div > div.form > button"))
 36         )  # 等到搜索按钮可以被点击
 37         input[0].send_keys(keyword)  # 向搜索框内输入关键词
 38         submit.click()  # 点击
 39         wait.until(
 40             EC.presence_of_all_elements_located(
 41                 (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b')
 42             )
 43         )
 44         total_page = driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[2]/em[1]/b').text
 45         return int(total_page)  # 返回最大页数
 46     except TimeoutError:
 47         search(keyword)
 48 
 49 def get_data(html):
 50     selec_data = etree.HTML(html)
 51     lis = selec_data.xpath('//ul[@class="gl-warp clearfix"]/li')
 52     for li in lis:
 53         try:
 54             title = li.xpath('.//div[@class="p-name p-name-type-2"]//em/text()')[0].strip()   # 名字
 55             price = li.xpath('.//div[@class="p-price"]//i/text()')[0].strip()   # 价格
 56             comment = li.xpath('.//div[@class="p-commit"]//a/text()')  # 评论数
 57             shop_name = li.xpath('.//div[@class="p-shop"]//a/text()')  # 商铺名字
 58             data_sku = li.xpath('.//div[@class="p-focus  "]/a/@data-sku')[0]  # 店铺id
 59             icons = li.xpath('.//div[@class="p-icons"]/i/text()')  # 备注
 60             comment = comment[0] if comment != [] else ''
 61             shop_name = shop_name[0] if shop_name != [] else ''
 62             icons_n = ''
 63             for x in icons:
 64                 icons_n = icons_n + ',' + x
 65             detail_url = li.xpath('.//div[@class="p-name p-name-type-2"]/a/@href')[0]  # 详情页网址
 66             detail_url = 'https:' + detail_url
 67             item = [title, price, comment, shop_name, data_sku, icons_n[1:], detail_url]
 68             print(item)
 69             sheet.append(item)
 70         except TimeoutError:
 71             get_data(html)
 72 
 73 def main():
 74     url_main = 'https://www.jd.com/'
 75     keyword = input('请输入商品名称:')  # 关键词
 76     driver.get(url=url_main)
 77     page = search(keyword)
 78     i = 1
 79     for p in range(3, page*2, 2):
 80         if i == 1:
 81             url = 'https://search.jd.com/Search?keyword={}&page={}&s={}&click=0'.format(keyword, p, i)
 82         else:
 83             url = 'https://search.jd.com/Search?keyword={}&page={}&s={}&click=0'.format(keyword, p, (i-1)*50)
 84         driver.get(url)
 85         time.sleep(1)
 86         driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")  # 自动下滑到底部
 87         time.sleep(3)
 88         driver.implicitly_wait(20)
 89         wait.until(
 90             EC.presence_of_all_elements_located((By.XPATH, '//*[@id="J_goodsList"]/ul/li[last()]'))
 91         )
 92         html = driver.page_source  # 获取网页源代码
 93         get_data(html)  # 把网页源代码传给解析函数解析
 94         time.sleep(1)
 95         print('正在爬取第{i}页')
 96         i += 1
 97     wb.save('京东{}信息.xlsx'.format(keyword))
 98 
 99 
100 if __name__ == '__main__':
101     main()  # 启动爬虫

运行结果：

2.对数据进行清洗与处理

2.1 删除缺失数据：

1 df.dropna(inplace=True)
2 df.reset_index(drop=True, inplace=True)

2.2 评论数量数据清洗：

 1 sales = []
 2 for p in df['commit'].values:
 3     if '万' in p:
 4         sales.append(int(p.split('万')[0]) * 10000)
 5     elif '+' in p: 
 6         sales.append(int(p[:-1]))
 7     elif '去' in p:
 8         sales.append(1000)
 9     else:
10         sales.append(int(p))
11 df['commit'] = sales

3.数据分析与可视化

3.1 华为手机价格直方图

# Price distribution: 40-bin histogram, saved to PNG and then displayed.
hist_style = dict(facecolor='blue', edgecolor='black')
plt.hist(df['price'], bins=40, **hist_style)
plt.xlabel('价格区间')  # x-axis label
plt.ylabel('频数/频率')  # y-axis label
plt.title('华为手机价格频数/频率分布直方图')  # figure title
plt.savefig('华为手机价格频数频率分布直方图.png')
plt.show()

3.2 华为手机评论数量直方图

# Comment-count distribution: 5-bin histogram, saved to PNG and then displayed.
hist_style = dict(facecolor='blue', edgecolor='black')
plt.hist(df['commit'], bins=5, **hist_style)
plt.xlabel('评论数量区间')  # x-axis label
plt.ylabel('频数/频率')  # y-axis label
plt.title('华为手机评论数量频数/频率分布直方图')  # figure title
plt.savefig('华为手机评论数量频数频率分布直方图.png')
plt.show()

3.3 华为手机前十店铺

 1 shop_va = df['shop'].value_counts()
 2 plt.figure(figsize=(25, 10))
 3 plt.bar(shop_va[:10].index, shop_va[:10].values,facecolor='red', edgecolor='black')
 4 # 显示横轴标签
 5 plt.xlabel('店铺名称', fontsize=18)
 6 # 显示纵轴标签
 7 plt.ylabel('数量', fontsize=18)
 8 # 显示图标题
 9 plt.xticks(fontsize=12)
10 plt.yticks(fontsize=15)
11 plt.title('华为手机前十店铺', fontsize=25)
12 plt.savefig('华为手机前十店铺.png')
13 plt.show()

4.华为手机价格与评论数量之间的一元线性回归模型

4.1 华为手机价格与评论数量的散点图

 1 plt.figure(figsize=(15, 10))
 2 plt.scatter(df['price'], df['commit'],facecolor='red', edgecolor='black')
 3 # 显示横轴标签
 4 plt.xlabel('价格', fontsize=18)
 5 # 显示纵轴标签
 6 plt.ylabel('评论数量', fontsize=18)
 7 # 显示图标题
 8 plt.xticks(fontsize=12)
 9 plt.yticks(fontsize=15)
10 plt.title('华为手机价格与评论数量的散点图', fontsize=25)
11 plt.savefig('华为手机价格与评论数量的散点图.png')
12 plt.show()

由以上散点图可知，华为手机的价格与评论数量之间呈现出一元线性关系，下面构建一元线性回归模型进行分析。其中，自变量为价格，因变量为评论数量。

# Fit comment count against price; both series are reshaped to column
# vectors before being passed to fit().
price_column = np.array(df['price'].values).reshape(-1, 1)
commit_column = np.array(df['commit'].values).reshape(-1, 1)
model = LinearRegression()
model.fit(price_column, commit_column)
model.intercept_  # intercept of the fitted line
model.coef_  # slope with respect to price

由此可以得到华为手机价格与评论数量之间的一元线性回归模型方程为（其中 x 为价格，y 为评论数量）：

y = 3159.47455023 - 0.22253572x

说明随着华为手机价格的升高，人们的评论趋向于减少。

完整代码

  1 #  导入相关包
  2 import time
  3 from selenium import webdriver
  4 from selenium.webdriver.support import expected_conditions as EC
  5 from selenium.webdriver.common.by import By
  6 from selenium.webdriver.support.ui import WebDriverWait
  7 from lxml import etree
  8 from openpyxl import Workbook
  9 
 10 
 11 # 数据存储
 12 wb = Workbook()
 13 sheet = wb.active
 14 sheet['A1'] = 'name'
 15 sheet['B1'] = 'price'
 16 sheet['C1'] = 'commit'
 17 sheet['D1'] = 'shop'
 18 sheet['E1'] = 'sku'
 19 sheet['F1'] = 'icons'
 20 sheet['G1'] = 'detail_url'
 21 
 22 options = webdriver.ChromeOptions()
 23 # 不加载图片，提高数据爬取速度
 24 options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
 25 driver = webdriver.Chrome(options=options)
 26 wait = WebDriverWait(driver, 45)  # 设置等待时间为45秒
 27 
 28 
 29 def search(keyword):
 30     try:
 31         input = wait.until(
 32             EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#key"))
 33         )  # 等到搜索框加载出来
 34         submit = wait.until(
 35             EC.element_to_be_clickable((By.CSS_SELECTOR, "#search > div > div.form > button"))
 36         )  # 等到搜索按钮可以被点击
 37         input[0].send_keys(keyword)  # 向搜索框内输入关键词
 38         submit.click()  # 点击
 39         wait.until(
 40             EC.presence_of_all_elements_located(
 41                 (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b')
 42             )
 43         )
 44         total_page = driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[2]/em[1]/b').text
 45         return int(total_page)  # 返回最大页数
 46     except TimeoutError:
 47         search(keyword)
 48 
 49 def get_data(html):
 50     selec_data = etree.HTML(html)
 51     lis = selec_data.xpath('//ul[@class="gl-warp clearfix"]/li')
 52     for li in lis:
 53         try:
 54             title = li.xpath('.//div[@class="p-name p-name-type-2"]//em/text()')[0].strip()   # 名字
 55             price = li.xpath('.//div[@class="p-price"]//i/text()')[0].strip()   # 价格
 56             comment = li.xpath('.//div[@class="p-commit"]//a/text()')  # 评论数
 57             shop_name = li.xpath('.//div[@class="p-shop"]//a/text()')  # 商铺名字
 58             data_sku = li.xpath('.//div[@class="p-focus  "]/a/@data-sku')[0]  # 店铺id
 59             icons = li.xpath('.//div[@class="p-icons"]/i/text()')  # 备注
 60             comment = comment[0] if comment != [] else ''
 61             shop_name = shop_name[0] if shop_name != [] else ''
 62             icons_n = ''
 63             for x in icons:
 64                 icons_n = icons_n + ',' + x
 65             detail_url = li.xpath('.//div[@class="p-name p-name-type-2"]/a/@href')[0]  # 详情页网址
 66             detail_url = 'https:' + detail_url
 67             item = [title, price, comment, shop_name, data_sku, icons_n[1:], detail_url]
 68             print(item)
 69             sheet.append(item)
 70         except TimeoutError:
 71             get_data(html)
 72 
 73 def main():
 74     url_main = 'https://www.jd.com/'
 75     keyword = input('请输入商品名称:')  # 关键词
 76     driver.get(url=url_main)
 77     page = search(keyword)
 78     i = 1
 79     for p in range(3, page*2, 2):
 80         if i == 1:
 81             url = 'https://search.jd.com/Search?keyword={}&page={}&s={}&click=0'.format(keyword, p, i)
 82         else:
 83             url = 'https://search.jd.com/Search?keyword={}&page={}&s={}&click=0'.format(keyword, p, (i-1)*50)
 84         driver.get(url)
 85         time.sleep(1)
 86         driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")  # 自动下滑到底部
 87         time.sleep(3)
 88         driver.implicitly_wait(20)
 89         wait.until(
 90             EC.presence_of_all_elements_located((By.XPATH, '//*[@id="J_goodsList"]/ul/li[last()]'))
 91         )
 92         html = driver.page_source  # 获取网页源代码
 93         get_data(html)  # 把网页源代码传给解析函数解析
 94         time.sleep(1)
 95         print('正在爬取第{i}页')
 96         i += 1
 97     wb.save('京东{}信息.xlsx'.format(keyword))
 98 
 99 
100 if __name__ == '__main__':
101     main()  # 启动爬虫
102 
103 #数据分析
104 #!/usr/bin/env python
105 # coding: utf-8
106 
107 # In[53]:
108 
109 
110 import pandas as pd
111 import numpy as np
112 import matplotlib.pyplot as plt
113 from sklearn.linear_model import LinearRegression
114 
115 
116 # In[18]:
117 
118 
119 # matplotlib基本配置
120 plt.rcParams['font.sans-serif'] = ['FangSong']  # 指定默认字体为仿宋
121 plt.rcParams['axes.unicode_minus'] = False     # 解决保存图像是负号'-'显示为方块的问题
122 
123 
124 # In[11]:
125 
126 
127 df = pd.read_excel('京东华为手机信息.xlsx')
128 
129 
130 # ## 数据清洗
131 
132 # ### 删除缺失数据
133 
134 # In[12]:
135 
136 
137 df.dropna(inplace=True)
138 df.reset_index(drop=True, inplace=True)
139 
140 
141 # In[15]:
142 
143 
144 df.info()
145 
146 
147 # In[13]:
148 
149 
150 df.head()
151 
152 
153 # ### 评论数量数据清洗
154 
155 # In[16]:
156 
157 
def _parse_comment_count(label):
    """Convert a comment-count label such as '2万+' or '500+' to an int."""
    text = str(label).strip()  # tolerate cells that were read back as numbers
    if '万' in text:
        # float() so that fractional labels such as '1.5万+' parse as well;
        # int() on '1.5' would raise ValueError.
        return int(round(float(text.split('万')[0]) * 10000))
    if '+' in text:
        return int(text.rstrip('+'))
    if '去' in text:
        # Labels containing '去' carry no number; 1000 is the placeholder
        # value this analysis uses for them.
        return 1000
    return int(float(text))


sales = [_parse_comment_count(p) for p in df['commit'].values]


# In[17]:


df['commit'] = sales
174 
175 
176 # In[41]:
177 
178 
179 df.head()
180 
181 
182 # ## 数据分析与可视化
183 
184 # ### 华为手机价格直方图
185 
186 # In[23]:
187 
188 
# Price distribution: 40-bin histogram, saved to PNG and then displayed.
hist_style = dict(facecolor='blue', edgecolor='black')
plt.hist(df['price'], bins=40, **hist_style)
plt.xlabel('价格区间')  # x-axis label
plt.ylabel('频数/频率')  # y-axis label
plt.title('华为手机价格频数/频率分布直方图')  # figure title
plt.savefig('华为手机价格频数频率分布直方图.png')
plt.show()
198 
199 
200 # ### 华为手机评论数量直方图
201 
202 # In[26]:
203 
204 
# Comment-count distribution: 5-bin histogram, saved to PNG and then displayed.
hist_style = dict(facecolor='blue', edgecolor='black')
plt.hist(df['commit'], bins=5, **hist_style)
plt.xlabel('评论数量区间')  # x-axis label
plt.ylabel('频数/频率')  # y-axis label
plt.title('华为手机评论数量频数/频率分布直方图')  # figure title
plt.savefig('华为手机评论数量频数频率分布直方图.png')
plt.show()
214 
215 
216 # In[28]:
217 
218 
# Count listings per shop.
shop_va = df['shop'].value_counts()


# In[30]:


# First ten entries of the per-shop count (displayed as the cell output).
shop_va[:10]


# ### Top ten shops

# In[40]:


top_shops = shop_va[:10]
plt.figure(figsize=(25, 10))
plt.bar(top_shops.index, top_shops.values, facecolor='red', edgecolor='black')
plt.xlabel('店铺名称', fontsize=18)  # x-axis label
plt.ylabel('数量', fontsize=18)  # y-axis label
plt.xticks(fontsize=12)  # tick label sizes
plt.yticks(fontsize=15)
plt.title('华为手机前十店铺', fontsize=25)  # figure title
plt.savefig('华为手机前十店铺.png')
plt.show()
245 
246 
247 # ## 价格与评论数量之间的一线回归模型
248 
249 # ### 价格与评论数据的散点图
250 
251 # In[44]:
252 
253 
# Scatter plot of price (x) against comment count (y), saved to PNG and shown.
x_values = df['price']
y_values = df['commit']
plt.figure(figsize=(15, 10))
plt.scatter(x_values, y_values, facecolor='red', edgecolor='black')
plt.xlabel('价格', fontsize=18)  # x-axis label
plt.ylabel('评论数量', fontsize=18)  # y-axis label
plt.xticks(fontsize=12)  # tick label sizes
plt.yticks(fontsize=15)
plt.title('华为手机价格与评论数量的散点图', fontsize=25)  # figure title
plt.savefig('华为手机价格与评论数量的散点图.png')
plt.show()
266 
267 
268 # ### 价格与评论数据的一元线性回归模型
269 
270 # #### 模型构建与训练
271 
272 # In[46]:
273 
274 
model = LinearRegression()


# In[58]:


# Fit comment count against price; both series are reshaped to column
# vectors before being passed to fit().
price_column = np.array(df['price'].values).reshape(-1, 1)
commit_column = np.array(df['commit'].values).reshape(-1, 1)
model.fit(price_column, commit_column)


# #### Model intercept

# In[61]:


model.intercept_


# #### Model coefficient

# In[62]:


model.coef_
298 
299 
300 # In[ ]:

五、结论

通过对京东华为手机商品数据做的数据分析，可以得出以下结论：

1.京东上的华为手机售价大多是5000元以下，其中3000元左右居多。

2.京东上的华为手机商品评论数据基本是100000条以下，说明销售量还可以。

3.随着华为手机价格的升高，商品的评论数量趋于减少。这点符合人们对商品的评价规律。

posted @ 2021-12-27 15:21 邑俞阅读(981) 评论(1) 收藏举报

刷新页面返回顶部

京东华为手机商品信息采集与数据分析

公告