利用urllib.request获取网页文本或图片至本地

#!/bin/env python
#-*- coding:utf-8 -*-
from urllib import request
import time
# Local path where the downloaded image is saved.
img_path = 'E:\\ROBOT\\python\\_post2.jpg'
# URL of the image to download.
liso = 'https://i5.walmartimages.com/asr/7b670288-5610-40f9-9ed8-45c1fc8520f8.b173e857205163756825165ebdce9d0e.jpeg?odnHeight=2000&odnWidth=2000&odnBg=FFFFFF'
# Request headers: a browser-like User-Agent sent with the request.
headers = [("User-agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36")]

# Build an opener that carries the custom headers and install it as the
# global default, so request.urlretrieve() below sends them.
# No delay is needed between these steps: they only configure local objects
# and perform no network traffic.
opener = request.build_opener()
opener.addheaders = headers
request.install_opener(opener)

# Download the image and write it to img_path.
request.urlretrieve(liso, img_path)

  

#获取网页文本

#!/bin/env python
# -*- coding: utf-8 -*-
import sys
import os
import requests
from lxml import etree
# Add the parent of the directory containing this file to the import path.
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))


def _clean_line(raw):
    """Return *raw* with digits and known boilerplate fragments removed.

    Every character for which ``str.isdigit()`` is true is dropped first;
    then the fragments below are removed one after another, in this order.
    """
    cleaned = ''.join(ch for ch in raw if not ch.isdigit())
    for fragment in ('. ', '[', 'lizhigushi.com', ']。', '\n'):
        cleaned = cleaned.replace(fragment, '')
    return cleaned


# Cleaned text fragments, in document order.
text = []
target = 'http://www.lizhigushi.com/haocihaoju/a7319.html'

# A timeout keeps the script from hanging forever on a stalled server, and
# raise_for_status() stops us from scraping an HTTP error page by mistake.
response = requests.get(target, timeout=10)
response.raise_for_status()
response.encoding = "utf-8"  # force UTF-8 decoding to avoid garbled text

# Parse the page source into an element tree that supports XPath queries.
selector = etree.HTML(response.text)
# All text nodes under <p> elements inside <div class="content">.
info = selector.xpath('//div[@class="content"]/p//text()')

for node_text in info:
    new_string = _clean_line(node_text)
    text.append(new_string)
    print(new_string)

  

posted @ 2022-03-31 11:23  基础很差  阅读(143)  评论(0)  编辑  收藏  举报