python常用语法

详情参考：https://www.runoob.com/python/python-tutorial.html

'''python抓取数据方式>>>开始'''

# 第一种：response 获取

data = response.text

# 第二种：requests 获取

data = requests.get(link)

data = data.text

# 第三种：urlopen 获取

data = urlopen(link).read()

# Beautiful Soup自动将输入文档转换为Unicode编码，输出文档转换为utf-8编码

data = BeautifulSoup(data, "html.parser")

# 第四种：xpath 解析获取

data = response.xpath('//div[@id="endText"]').get()

# Beautiful Soup自动将输入文档转换为Unicode编码，输出文档转换为utf-8编码

data = BeautifulSoup(data, 'html.parser')

常用方法

字符串是否包含

if 'ce' in nice:

去除第一个字符

nice = nice[1:]

去除最后一个字符

nice = nice[:-1]

去除字符串左边的空格

nice.lstrip()

去除字符串右边的空格

nice.rstrip()

数组的长度

length = len(array)

nice转字符串:

nice = ''.join(nice)

或者

nice = repr(nice)

nice转json:

json.loads()  # 将JSON格式的字符串解码为python对象
json.load()  # 从文件对象中读取JSON并解码为python对象

循环遍历:

for str in list:
    print(str)

if 'nice' in str:
    continue
    break

替换字符串中的反斜杠\

str = eval(repr(str).replace('\\', '@'))

字符串str转换成int

int_value = int(str_value)

int转换成字符串str:

str_value = str(int_value)

分割字符串

nice.rsplit(",")

解决urlopen乱码开始

typeEncode = sys.getfilesystemencoding() ##系统默认编码
infoencode = chardet.detect(html).get('encoding', 'utf-8') ##通过第3方模块来自动提取网页的编码
html = html.decode(infoencode, 'ignore').encode(typeEncode) ##先转换成unicode编码，然后转换系统编码输出

延时抓取

import time
time.sleep(3)

获取当前时间(格式: 年-月-日 时:分:秒)

import time
time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

时间戳转时间格式(时间戳的长度为10位才可以,否则会报此异常:OSError: [Errno 22] Invalid argument)

timeArray = time.localtime(upload_time)
upload_time = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
# 若时间戳的长度为13位,则需要变成10位的
timeArray = time.localtime(int(upload_time/1000))

为空判断

if str is None or str == '':
    continue

截取字符串

str = str[0:4]

多个if判断

if 'pic-group clear' == divclass:
    print('4张图')
elif 'pic img-do left' == divclass:
    print('1张图')
else:
    print('无图')

获取script的内容

soup = BeautifulSoup(html, "html.parser")
scripts = soup.select("script") # CSS 选择器

python爬虫去除网页中的script结构

import re

clear = re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)
content = clear.sub("", content)
# 去除id="content-ad"的div
clear = re.compile(r'<div id="content-ad">(.*)</div>', re.S)
content = clear.sub("", content)

posted @ 2020-09-07 20:18 踏步阅读(200) 评论(0) 收藏举报

刷新页面返回顶部