Python 爬虫基本示例
示例一:
import urllib.request
# Fetch one page and print basic metadata about the response.
# The target lives in ``url`` so the variable is actually used, instead of a
# dead placeholder sitting next to a hard-coded address.
url = "http://www.baidu.com"
req = urllib.request.Request(url)
# The context manager closes the connection even if one of the prints raises.
with urllib.request.urlopen(req) as response:
    print(response.info())     # response headers
    print(response.getcode())  # HTTP status code
    print(response.geturl())   # URL of the resource actually retrieved
示例二:
#中英文翻译
import urllib.request
import urllib.parse
import json
import sys
def open_url():
    """Submit the module-level ``content`` string to the iciba endpoint.

    Reads the global ``content`` (assigned by the interactive loop under
    ``__main__``) and sends it as the form field ``w``, with the fields ``f``
    and ``t`` both set to ``'auto'``. Supplying form data makes urllib issue
    a POST request.

    Returns:
        bytes: The raw, undecoded response body.

    Raises:
        NameError: If ``content`` has not been assigned yet.
        urllib.error.URLError: If the request cannot be completed.
    """
    url = "http://fy.iciba.com/ajax.php?a=fy"
    form = {
        'f': 'auto',
        't': 'auto',
        'w': content,
    }
    data = urllib.parse.urlencode(form).encode('UTF-8')
    req = urllib.request.Request(url, data)
    # Headers must be attached before urlopen() sends the request; adding
    # them afterwards has no effect on what the server receives.
    # NOTE(review): the value looks like a User-Agent with an
    # "X-Requested-With:XMLHttpRequest" header fused onto its end - confirm
    # whether that was meant to be a separate header.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2132.2 Safari/537.36X-Requested-With:XMLHttpRequest')
    # Close the connection deterministically once the body has been read.
    with urllib.request.urlopen(req) as response:
        return response.read()
def english(html):
    """Print the translation result for an English query.

    Args:
        html: UTF-8 encoded JSON bytes (presumably the body returned by
            ``open_url()``) whose top-level ``content`` object carries the
            result.

    The first entry of ``content['word_mean']`` is printed when it exists;
    otherwise ``content['out']`` is printed instead.

    Raises:
        KeyError: If ``content`` is missing, or if neither ``word_mean`` nor
            ``out`` is present.
    """
    payload = json.loads(html.decode('utf-8'))['content']
    try:
        result = payload['word_mean'][0]
    except (KeyError, IndexError, TypeError):
        # No usable word_mean list in the reply: fall back to the plain
        # 'out' field. Only lookup failures are caught, so Ctrl-C and
        # unrelated errors still propagate.
        result = payload['out']
    print("翻译结果: %s" % result)
def chin(html):
    """Print the translation result for a Chinese query.

    Args:
        html: UTF-8 encoded JSON bytes (presumably the body returned by
            ``open_url()``) containing ``content['out']``.

    Raises:
        KeyError: If ``content`` or ``content['out']`` is missing.
    """
    reply = json.loads(html.decode('utf-8'))
    print("翻译结果: %s" % reply['content']['out'])
# Check whether a piece of text contains Chinese characters.
def Chinese(str):
    """Return True if any character of *str* lies in U+4E00..U+9FA5.

    The test is made per character, so text that mixes Latin and Chinese
    characters without separating whitespace (e.g. ``"hello你好"``) is still
    recognised.

    Args:
        str: The text to inspect. The name shadows the builtin ``str``; it is
            kept unchanged so existing keyword callers keep working.

    Returns:
        bool: True when at least one character is in the range, else False
        (including for an empty string).
    """
    return any('\u4e00' <= ch <= '\u9fa5' for ch in str)
if __name__ == "__main__":
    # Interactive loop: read a phrase, translate it, repeat until 'q'.
    # ``content`` must stay a module-level name because open_url() reads it
    # as a global rather than taking it as an argument.
    while True:
        content = input("请输入需要翻译的内容(退出q): ").strip()
        if content == 'q':
            sys.exit()
        if not content:
            # Nothing to translate; prompt again instead of sending an
            # empty query to the service.
            continue
        # Pick the printer that matches the input language, then fetch.
        show = chin if Chinese(content) else english
        show(open_url())
示例三:
#图片下载例子
import urllib.request
import re
def open_url(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address of the page to download.

    Returns:
        str: The decoded page content.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    req = urllib.request.Request(url)
    # NOTE(review): the value looks like a User-Agent with an
    # "X-Requested-With:XMLHttpRequest" header fused onto its end - confirm
    # whether that was meant to be a separate header.
    req.add_header('User-agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2132.2 Safari/537.36X-Requested-With:XMLHttpRequest')
    # Close the connection deterministically once the body has been read.
    with urllib.request.urlopen(req) as page:
        return page.read().decode('utf-8')
def get_img(html):
    """Print and download every ``BDE_Image`` .jpg referenced in *html*.

    Looks for ``<img class="BDE_Image" src="...jpg"`` tags. All matching URLs
    are printed first, one per line; each is then downloaded to a file named
    after the last path segment of its URL (a relative path, so it lands in
    the current working directory).

    Args:
        html: Page markup as text.
    """
    p = r'<img class="BDE_Image" src="([^"]+\.jpg)"'
    imglist = re.findall(p, html)
    for each in imglist:
        print(each)
    for each in imglist:
        filename = each.split('/')[-1]
        # Download the image.
        urllib.request.urlretrieve(each, filename)
if __name__ == '__main__':
    # Fetch the page at this URL and download the images it references.
    url = "http://tieba.baidu.com/p/3563409202"
    get_img(open_url(url))
示例四:
#匹配IP地址例子
import urllib.request
import re
def open_url(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address of the page to download.

    Returns:
        str: The decoded page content.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    req = urllib.request.Request(url)
    # NOTE(review): the value looks like a User-Agent with an
    # "X-Requested-With:XMLHttpRequest" header fused onto its end - confirm
    # whether that was meant to be a separate header.
    req.add_header('User-agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2132.2 Safari/537.36X-Requested-With:XMLHttpRequest')
    # Close the connection deterministically once the body has been read.
    with urllib.request.urlopen(req) as page:
        return page.read().decode('utf-8')
def get_img(html):
    """Print every dotted-quad IPv4 address found in *html*, one per line.

    Despite its name, this function does not handle images; the name is kept
    so existing callers continue to work.

    Args:
        html: Text to scan for addresses.
    """
    # One octet, 0-255. The three-digit alternatives come first so a value
    # such as "255" is not cut short to "25" by the shorter alternative.
    octet = r'(?:25[0-5]|2[0-4]\d|[0-1]?\d?\d)'
    # The digit lookarounds stop a match from starting or ending in the
    # middle of a longer run of digits (e.g. "1234.5.6.7").
    p = r'(?<!\d)(?:' + octet + r'\.){3}' + octet + r'(?!\d)'
    iplist = re.findall(p, html)
    for each in iplist:
        print(each)
if __name__ == '__main__':
    # Fetch the page at this URL and print the IP addresses it contains.
    url = "http://www.xicidaili.com"
    get_img(open_url(url))

浙公网安备 33010602011771号