# 利用 urllib.request 下载图片至本地，以及利用 requests + lxml 获取网页文本
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Download a single image from a URL to a local file with ``urllib.request``."""
import shutil
import time  # not needed by the download itself; kept for any other code in this file
from urllib import request

# Local path the downloaded image is saved to.
img_path = 'E:\\ROBOT\\python\\_post2.jpg'
# URL of the image to download.
liso = 'https://i5.walmartimages.com/asr/7b670288-5610-40f9-9ed8-45c1fc8520f8.b173e857205163756825165ebdce9d0e.jpeg?odnHeight=2000&odnWidth=2000&odnBg=FFFFFF'
# Request headers: a browser-like User-agent replaces urllib's default one.
headers = [("User-agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36")]


def download_image(url, dest, request_headers=None, timeout=30):
    """Stream the resource at *url* into the file *dest* and return *dest*.

    Args:
        url: Address of the resource to fetch (any scheme urllib can open).
        dest: Path of the local file to create or overwrite.
        request_headers: List of ``(name, value)`` header pairs to send;
            defaults to the module-level ``headers``.
        timeout: Seconds to wait on blocking network operations.

    Returns:
        The *dest* path that was written.

    Raises:
        urllib.error.URLError: If the request fails (including HTTP errors).
        OSError: If *dest* cannot be written, or the server delivered fewer
            bytes than its Content-Length header announced.
    """
    # A private opener carries the headers without changing the
    # process-wide default opener used by every other urlopen() call.
    opener = request.build_opener()
    opener.addheaders = list(headers if request_headers is None else request_headers)

    with opener.open(url, timeout=timeout) as response, open(dest, "wb") as out:
        shutil.copyfileobj(response, out)
        written = out.tell()
        announced = response.headers.get("Content-Length")

    # Detect truncated downloads the same way urlretrieve() does.
    if announced is not None and announced.strip().isdigit():
        if written < int(announced):
            raise OSError(
                "retrieval incomplete: got only %d out of %d bytes"
                % (written, int(announced))
            )
    return dest


if __name__ == "__main__":
    download_image(liso, img_path)
#获取网页文本
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Fetch an article page and print the cleaned text of its content paragraphs."""
import os
import sys

# Add the grandparent directory of this file to the module search path.
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))

# Page whose paragraph text is extracted.
target = 'http://www.lizhigushi.com/haocihaoju/a7319.html'

# Substrings removed from every text fragment, applied in this order
# after all digit characters have been dropped.
_NOISE_TOKENS = ('. ', '[', 'lizhigushi.com', ']。', '\n')


def clean_fragment(fragment):
    """Return *fragment* with digits and site-specific noise removed.

    Every character for which ``str.isdigit()`` is true is dropped first
    (this turns list numbering such as ``"12. "`` into ``". "``), then each
    entry of ``_NOISE_TOKENS`` is deleted in order.

    Args:
        fragment: One text node extracted from the page.

    Returns:
        The cleaned string; may be empty.
    """
    cleaned = ''.join(ch for ch in fragment if not ch.isdigit())
    for token in _NOISE_TOKENS:
        cleaned = cleaned.replace(token, '')
    return cleaned


def fetch_fragments(url, timeout=10):
    """Download *url* and return the text nodes of its content paragraphs.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of strings matched by the XPath
        ``//div[@class="content"]/p//text()``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Third-party packages are imported here so that clean_fragment() stays
    # importable on machines where they are not installed.
    import requests
    from lxml import etree

    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than parse an error page as if it were the article.
    response.raise_for_status()
    # Force UTF-8 decoding to avoid mojibake in the extracted text.
    response.encoding = "utf-8"
    # Parse the HTML source into a tree that can be queried with XPath.
    selector = etree.HTML(response.text)
    return selector.xpath('//div[@class="content"]/p//text()')


def main():
    """Print one cleaned line per text fragment found on ``target``."""
    for fragment in fetch_fragments(target):
        print(clean_fragment(fragment))


if __name__ == "__main__":
    main()