import requests
from bs4 import BeautifulSoup
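# Third-party dependencies: pip install requests beautifulsoup4 lxml
# (lxml is required because BeautifulSoup is invoked with the 'lxml' parser below).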
class GetWebData:
    def __init__(self):
        # Request headers copied from a browser session. The JSESSIONID cookie
        # is session-specific and may need refreshing if the server rejects
        # the request.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cookie': 'JSESSIONID=59D7AE73DA0256B8DACA9712795B8EB5',
            'Host': '61.142.120.214:9000',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        }
        # The listing URL is split around the page number so that pages can be
        # enumerated with simple string formatting.
        self.urlHead = 'http://61.142.120.214:9000/web/salepermit.jsp?page='
        self.urlTail = '&&projectname=&&code=&&compname=&&address=&&date1=&&date2='
    def collectDataFromURL(self):
        # Fetch the first ten listing pages.
        for i in range(1, 11):
            url = "{}{}{}".format(self.urlHead, i, self.urlTail)
            try:
                response = requests.get(url, headers=self.headers, timeout=20)
            except requests.RequestException as e:
                print("Request to {} failed: {}".format(url, e))
                continue
            if response.status_code == 200:
                print("{} fetched successfully".format(url))
                # The site serves GBK-encoded pages, so set the encoding
                # explicitly before decoding the body.
                response.encoding = 'GBK'
                soup = BeautifulSoup(response.text, 'lxml')
                self.getOnePage(soup)
            else:
                print("Request to {} failed with status code {}".format(url, response.status_code))
    def getOnePage(self, soup):
        # Print the cells of every table row on the page.
        for tr in soup.find_all('tr'):
            tds = tr.find_all('td')
            print(tds)
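            # A possible refinement (not in the original code): extract the
            # visible cell text rather than the raw <td> tags, e.g.
            #     row = [td.get_text(strip=True) for td in tds]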
if __name__ == '__main__':
    crawler = GetWebData()
    crawler.collectDataFromURL()