上周学习进度——java爬虫,Python基础
直接上代码:
java爬虫,爬取网页数据
import org.jsoup.Jsoup;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.net.URL;
/**
 * Demo crawler: downloads a JD.com search-result page with Jsoup and prints
 * the image URL, price and title found in every {@code <li>} of the
 * product list, followed by the raw HTML of the list container.
 */
public class HtmlParseUtil {

    /**
     * Fetches the search page, extracts the product entries and prints them.
     *
     * @param args command-line arguments (unused)
     * @throws IOException if the page cannot be fetched, or if the expected
     *                     product-list container is missing from the response
     */
    public static void main(String[] args) throws IOException {
        // Request URL; the original note also lists https://www.tmall.com/.
        // Network access is required; content loaded later via AJAX is not
        // part of the fetched document.
        String url = "https://search.jd.com/Search?keyword=%E4%BC%91%E9%97%B2%E9%9E%8B%E7%94%B7&enc=utf-8&wq=%E4%BC%91%E9%97%B2%E9%9E%8B%E7%94%B7&pvid=38442c036b964f0e8e703d4f5dacc6fe";

        // Fetch and parse the page (3000 ms timeout); Jsoup returns a Document.
        Document document = Jsoup.parse(new URL(url), 3000);

        // Jsoup exposes DOM-style lookups like those available in JavaScript.
        Element element = document.getElementById("J_goodsList");
        if (element == null) {
            // getElementById returns null when the id is absent; fail with a
            // descriptive error instead of a NullPointerException further down.
            throw new IOException(
                    "Element with id 'J_goodsList' not found in response from " + url);
        }

        // Collect every <li> tag inside the product list.
        Elements elements = element.getElementsByTag("li");

        // Each el is one <li> tag; print its image, price and title.
        for (Element el : elements) {
            String img = el.getElementsByTag("img").eq(0).attr("src");
            String price = el.getElementsByClass("p-price").eq(0).text();
            String title = el.getElementsByClass("p-name").eq(0).text();
            System.out.println("******************************************");
            System.out.println(img);
            System.out.println(price);
            System.out.println(title);
        }

        System.out.println(element.html());
    }
}
Python基础:
# -*- coding: utf-8 -*-
# @Time : 23:54
# @Author : wyt
# @File : demo1.py
# @Software : PyCharm
'''
age = 18
print("我已经%d岁了!"%age)
'''
'''
name="小张"
abc="中国"
print("我的名字是%s,我的国籍是%s"%(name,abc))
'''
'''
#分割:
print("aaa","bbb","ccc")
print("www","baidu","com",sep=".")
'''
#不换行:
'''
print("hello",end="") #在输出内容后面直接继续输出
print("hello",end="\t") #在输出内容后面进行一小段的间隔
print("hello",end="\n") #换行输出
print("end")
'''
'''
password = input("请输入密码:")
print("您刚刚输入的密码是:",password)
print(type(password))
'''
'''
#强制转换直接int
a=int("123")
b = a+100
print(b)
'''
'''
#if语句
score= 77
if score>80 and score<90 :
print("优秀!")
elif score>90:
print("太牛了哥~")
elif score>70 and score<80:
print("不太行啊宝儿")
elif score>0 and score<60:
print("废物啊~")
else:
print("勉强及格~")
'''
'''
#随机数
import random
x = random.randint(0,10) #随机生成0-10的数
print(x)
'''
#循环语句:
#for循环:
'''
for i in range(5): #i从0开始,输出0-4
print(i)
'''
'''
for i in range(0,10,3): #区间:0-10 步长3
print(i)
'''
'''
for i in range (-10,-100,-30):
print(i)
'''
'''
name = "tangshan"
for x in name:
print(x,end="\t")
'''
'''
a = ["aa","bb","cc","dd"]
for i in range(len(a)):
print(i,a[i])
'''
#while:
'''
i=0
while i<5:
print("当前是第%d次执行循环"%(i+1))
print("i=%d"%i)
i=i+1
'''
# 1-100求和
'''
n=100
sum=0
counter=1
while counter <=n:
sum=sum+counter
counter+=1
print("1~%d的和为:%d"%(n,sum))
'''
'''
count = 0
while count<5:
print(count,"小于5")
count+=1
else:
print(count,"大于等于5")
'''
#字符串
'''
word = '字符串'
sentence = "这是个句子"
paragraph = """
这是一个段落
可以有多行组成
"""
print(word)
print(sentence)
print(paragraph)
'''
#单引号和双引号的区别:
'''
my_str = "I'm a student"
my_str1 = 'I\'m a student'
print(my_str1)
'''
'''
my_str = "Jason said \"I like you\""
print(my_str)
'''
'''
str = "chengdu"
string = "凌云"
print(str)
print(str[0:7]) #第一个是起始位置,第二个是结束位置,不输出结束位置的内容
print(str[0:7:2]) #步长为2
print(string[0])
print(str+"你好") #字符串连接
print(str*3) # 连续打印多次
print(r"hello\nchengdu") # 前面有个r,后面的转义字符\失效
'''
# Lists:
# An empty list is written as: nameList = []
nameList = [
    "凌云",
    "阎亮",
    "戴伟伟",
    "王玉昙",
]
'''
print(nameList[0])
print(nameList[1])
print(nameList[2])
print(nameList[3])
testList = [1,"aaa"]
print(testList[0])
print(testList[1])
print((type(testList)))
'''
#遍历:
'''
for name in nameList:
print(name)
print(len(nameList))
i=0
while i<len(nameList):
print(nameList[i])
i=i+1
'''
#增加: append
'''
print("追加前:")
for name in nameList:
print(name)
nametemp = input("请输入学生姓名:")
nameList.append(nametemp)
print("增加后:")
for name in nameList:
print(name)
'''
#追加: extend
'''
a = [1,2]
b = [3,4]
a.append(b) #将列表b当成一个元素放到a
print(a)
a.extend(b) #将b列表中的每个元素,逐一追加到列表a中
print(a)
'''
#增加:insert:指定下标位置,插入元素
'''
a = [0,1,2]
a.insert(1,3) # 第一个为下标,第二个为对象
print(a)
'''
#删除:
#del:
'''
movieName = ["凌云","南日","难日","我贼","秋红"]
print("删除前:")
for name in movieName:
print(name)
del movieName[2] # 删除指定位置的元素
print("删除后:")
for name in movieName:
print(name)
'''
#删除: pop
'''
movieName = ["凌云","南日","难日","我贼","秋红"]
print("删除前:")
for name in movieName:
print(name)
movieName.pop() # 删除末尾的元素
print("删除后:")
for name in movieName:
print(name)
'''
#删除:remove
'''
movieName = ["凌云","南日","难日","我贼","秋红"]
print("删除前:")
for name in movieName:
print(name)
movieName.remove("我贼") # 直接删除指定内容(当有重复数据时,删除找到的第一个数据)
print("删除后:")
for name in movieName:
print(name)
'''
#修改:
'''
print("修改前:")
for name in nameList:
print(name)
nameList[1] = "瓜着呢" # 通过下标直接修改指定位置的元素
print("修改后:")
for name in nameList:
print(name)
'''
#查: [in not in]
'''
findName = input("请输入你要查找的学生姓名:")
if findName in nameList:
print("在学生名单中找到了相同的名字")
else:
print("没有找到")
'''
'''
a = ["a","b","c","a","b"]
print(a.index("a",1,4)) # 在指定下标范围内查找:从下标1到4(左闭右开,即1~3)中找"a",如果找到了,返回下标
# print(a.index("a",1,2)) 范围区间,左闭右开,不在范围内时会报错
print(a.count("a")) # 显示数量
'''
#排序:
'''
a=[1,4,2,3]
print(a)
a.reverse() #将列表所有元素反转
print(a)
a.sort() # 从低到高排序
print(a)
a.sort(reverse=True) # 从高到低排序
print(a)
'''
# 小测试:
'''
import random
schoolName = [["北京大学","清华大学"],["南开大学","天津大学","天津师范大学"],["山东大学","中国海洋大学"]] #二维列表:每个元素都是一个列表
print(schoolName[0][0])
offices = [[],[],[]] #包含三个空列表的列表
names = ["A","B","C","D","E","F","G","H"]
for name in names:
index = random.randint(0,2)
offices[index].append(name)
i=1
for office in offices:
print("办公室%d人数为:%d"%(i,len(office)))
i=i+1
for name in office:
print("%s"%name,end="\t")
print("\n")
print("-"*20)
'''
# 元组:元组中的元素不可改变,但是能包含可以改变的对象
'''
tup1 = () # 创建空的元组
# tup2 = (50) <class 'int'>
tup2 = (50,)
print(type(tup2)) #<class 'tuple'>
print(type(tup1))
'''
'''
#将其他内容转化为元组:
tuple(需要转化的内容)
'''
'''
tup1 = ("abc","def",2000,2020,333,444,555)
print(tup1[0])
print(tup1[-1]) #访问最后一个元素
print(tup1[1:5]) #左闭右开,进行切片
'''
# 增: 连接
'''
tup1 = (12,34,56)
#tup1[0]=100 报错,不支持修改
tup2 = ("abc","xyz")
tup = tup1+tup2
print(tup)
'''
#删:
'''
tup1 = (12,34,56)
print(tup1)
del tup1 #删除了整个元组,整个变量,不允许删除具体值
print("删除后:")
print(tup1)
'''
#改:
'''
tup1 = (12,34,56)
#tup1[0]=100 报错,不支持修改
print(tup1)
'''
#查:就是直接访问
#字典:(map)
'''
info = {"name":"吴彦祖","age":17}
#字典的访问:
print(info["name"])
print(info["age"])
#访问不存在的键
#print(info["gender"]) #直接访问会报错
#print(info.get("gender")) #使用get方法,没找到对应的键会默认返回一个None
print(info.get("gender","m")) #没找到时,设定默认值为:m,找得到时,默认值不产生作用
'''
#增
'''
info = {"name":"吴彦祖","age":17}
newID=input("请输入一个新学号")
info["id"] = newID
print(info["id"])
'''
#删
#del clear
'''
info = {"name":"吴彦祖","age":17}
print("删除前:%s"%info["name"])
del info["name"]
print("删除后:%s"%info.get("name","无"))
'''
'''
info = {"name":"吴彦祖","age":17}
print("删除前:%s"%info)
del info #删除整个字典
'''
#将里面的内容都清空:
'''
info = {"name":"吴彦祖","age":17}
print("清空前:%s"%info)
info.clear()
print("清空后:%s"%info)
'''
#改
'''
info = {"name":"吴彦祖","age":17}
info["age"] = 20
print(info["age"])
'''
#查
'''
info = {"id":1,"name":"吴彦祖","age":17}
print(info.keys()) #得到所有的键
print(info.values()) #得到所有的值
print(info.items()) #得到所有的项,每个键值对都是一个元组
#遍历所有的键
for key in info.keys():
print(key)
#遍历所有的值和项
for key,value in info.items():
print("key=%s,value=%s"%(key,value))
'''
'''
mylist = ["a","b","c","d"]
#既拿到值,又拿到下标
for i,x in enumerate(mylist):
print(i,x) #第一个参数可以进行加减操作
#其他类型转化为字典: dict() 例子: dict([(1,2),(2,3)])
'''
# -*- coding: utf-8 -*-
# @Time : 17:03
# @Author : wyt
# @File : demo1.py
# @Software : PyCharm
#函数的定义:
'''
def printinfo():
print("----------------------")
print("人生苦短,我用Python")
print("----------------------")
#函数的调用:
printinfo()
'''
#带参数的函数:
'''
def add2Num(a,b):
c=a+b
print(c)
add2Num(11,22)
'''
#带返回值的函数:
'''
def add2Num(a,b):
return a+b
result=add2Num(11,22)
print(add2Num(11,22))
print(result)
'''
#返回多个值的函数:
'''
def divid(a,b):
shang = a/b
yushu = a%b
return shang,yushu
sh,yu=divid(5,2) #用多个值接收返回值,逗号分割
print(sh,yu)
'''
#例题:
'''
def printOneLine():
print("-"*30)
def printNumLine(num):
i=0
while i<num:
printOneLine()
i=i+1
printNumLine(3)
'''
'''
def sum3Number(a,b,c):
return a+b+c
print(sum3Number(1,2,3))
'''
#平均值:
'''
def average3Nmuber(a,b,c):
sumResult = (a+b+c)/3.0
return sumResult
result=average3Nmuber(10,20,30)
print("平均值为%d"%result)
'''
# -*- coding: utf-8 -*-
# @Time : 21:10
# @Author : wyt
# @File : demo2.py
# @Software : PyCharm
'''
f = open("test.txt","w") #文件不存在时,在w模式下,自动创建一个新的
f.write("hello world i am here !") #写入文件
f.close() #关闭
'''
'''
f = open("test.txt","r") #文件不存在时,在r模式下,不会自动创建一个新的,r为只读
count=f.read(10)#读取前十个字符,在r下使用
content = f.readlines() #读出所有行,全部读完
i=1
for temp in content:
print("%d %s"%(i,temp))
i=i+1
#print(count)
#print(content)
f.close()
'''
# File operations: rename a file with the os module.
import os

old_name, new_name = "test.txt", "test1.txt"
os.rename(old_name, new_name)
# -*- coding: utf-8 -*-
# @Time : 21:29
# @Author : wyt
# @File : demo3.py
# @Software : PyCharm
# Exception-handling demo: prints "1", tries to open 123.txt for reading and
# prints "2" on success; if opening fails, the error message is printed.
# Catch the specific error family rather than everything: open() failures
# (FileNotFoundError, PermissionError, ...) are all OSError subclasses.
# Several types can be caught at once with a tuple, e.g.
# ``except (IOError, NameError):``.
try:
    print("1")
    # The with-statement closes the file even if a later statement raises.
    with open("123.txt", "r") as f:
        print("2")
except OSError as result:
    print(result)  # print the error that was caught
# To catch every exception instead:
#     except Exception as result:
#         print(result)
# Sleeping:
import time


def a():
    """Print "1", pause for two seconds, then print "2"."""
    pause_seconds = 2
    print("1")
    time.sleep(pause_seconds)  # sleep for two seconds
    print("2")
浙公网安备 33010602011771号