class INTERFACING():
    """Selenium helper that drives one Chrome session and walks paginated result tables.

    NOTE(review): the original text had lost all indentation; the nesting used
    here is reconstructed from the statement order and should be confirmed
    against the author's working copy.

    The methods rely on module-level names that are not visible in this chunk:
    ``uc`` (presumably undetected_chromedriver), ``BeautifulSoup``, ``By``,
    ``Keys``, ``time``, ``random`` and ``re``.
    """

    # Rows per result page once the "20条/页" option has been selected.
    PAGE_SIZE = 20
    # Upper bound on 3-second polls while waiting for a result page to fill.
    MAX_ROW_WAITS = 40

    def __init__(self):
        self.driver_initialized = False
        self.driver = ''  # placeholder until get_driver() creates the browser
        self.MAX_TRIALS = 2
        self.main_page_handle = None  # window recorded by switch_handle()

    def make_soup(self):
        """Parse the current page source with the lxml parser."""
        return BeautifulSoup(self.driver.page_source, 'lxml')

    def current_url(self):
        """Return the URL currently loaded in the driver."""
        return self.driver.current_url

    def get_driver(self, version_main=113):
        """Start Chrome through ``uc`` and mark the driver as initialised.

        Args:
            version_main: Major Chrome version handed to ``uc.Chrome``. It is
                coerced to ``int``; the original passed the string ``"113"``.
        """
        chrome_options = uc.ChromeOptions()
        # Headless mode stays off, as in the original (the flag was commented out).
        flags = (
            "--window-size=1920,1080",  # was "1920.,1080" (stray dot)
            "--disable-extensions",
            "--disable-dev-shm-usage",
            "--disable-popup-blocking",
            "--profile-directory=Default",
            "--ignore-certificate-errors",
            "--disable-plugins-discovery",
            "--incognito",
            "--no-first-run",
            "--no-service-autorun",
            "--no-default-browser-check",
            "--password-store=basic",
            "--no-sandbox",
            '--disable-application-cache',
            '--disable-gpu',
            "--disable-setuid-sandbox",
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        )
        for flag in flags:
            chrome_options.add_argument(flag)
        self.driver = uc.Chrome(options=chrome_options, version_main=int(version_main))
        time.sleep(10)  # fixed pause after launch, kept from the original
        self.driver_initialized = True

    def close_driver(self):
        """Quit the browser if one was started; safe to call more than once."""
        if not self.driver:
            return
        try:
            self.driver.quit()
        finally:
            # Reset state so a later get_selenium_response() starts a new browser.
            self.driver = ''
            self.driver_initialized = False

    def get_selenium_response(self, url):
        """Open ``url`` (starting the driver on first use) and return the parsed page."""
        if not self.driver_initialized:
            self.get_driver()
        self.driver.get(url)
        time.sleep(3)
        return self.make_soup()

    def get_page_source(self):
        """Return the raw HTML of the current page."""
        return self.driver.page_source

    def clicking(self, xpath):
        """Click the element located by ``xpath``, then pause 2-3 seconds."""
        self.driver.find_element(By.XPATH, xpath).click()
        time.sleep(random.randint(2, 3))

    def entering_values(self, xpath, value):
        """Clear the element located by ``xpath``, type ``value``, then pause 2-4 seconds."""
        elem = self.driver.find_element(By.XPATH, xpath)
        elem.clear()
        elem.send_keys(value)
        time.sleep(random.randint(2, 4))

    def send_keys(self, xpath):
        """Send the RETURN key to the element located by ``xpath``."""
        self.driver.find_element(By.XPATH, xpath).send_keys(Keys.RETURN)

    def going_back(self):
        """Go one step back in the browser history."""
        self.driver.execute_script("window.history.go(-1)")
        time.sleep(1)

    def refresh_page(self):
        """Reload the current page."""
        self.driver.refresh()

    def close_handle(self):
        """Close the window that currently has focus."""
        self.driver.close()

    def get_current_handle(self):
        """Return the handle of the focused window."""
        return self.driver.current_window_handle

    def get_all_handles(self):
        """Return the handles of all open windows."""
        return self.driver.window_handles

    def swtich_to_window(self, handle):
        """Focus the window identified by ``handle`` (misspelled name kept for callers)."""
        self.driver.switch_to.window(handle)

    def switch_handle(self, second_handle=''):
        """Switch to the first window that is neither the current one nor ``second_handle``.

        The window focused at call time is stored in ``self.main_page_handle``.

        Returns:
            The handle switched to, or ``None`` when no other window exists.
        """
        # Read once: the focused window cannot change before the first switch.
        self.main_page_handle = self.get_current_handle()
        for handle in self.get_all_handles():
            if handle == self.main_page_handle:
                continue
            if second_handle and handle == second_handle:
                continue
            self.swtich_to_window(handle)
            return handle
        return None

    def close_handles(self, page_handle, second_handle):
        """Close the window ``page_handle`` (if open) and focus ``second_handle``."""
        if page_handle in self.get_all_handles():
            try:
                # Focus the target first; driver.close() acts on the focused window.
                self.swtich_to_window(page_handle)
                self.close_handle()
            except Exception as error:
                print(f'could not close window {page_handle}: {error}')
        self.swtich_to_window(second_handle)

    @staticmethod
    def _result_rows(soup):
        """Return the ``el-table__row`` rows of the result table, or None if it is not rendered."""
        table = soup.find('table', class_='el-table__body')
        if table is None or table.tbody is None:
            return None
        return table.tbody.find_all('tr', class_=re.compile('el-table__row'))

    def skip_button(self, class_item):
        """Try to dismiss the intro overlay by clicking ``<a class*=class_item>``.

        Polls up to 20 times. After a failed click it also tries the
        "Medical Devices" tab, and from the fifth failure on it checks whether
        the result table (or its empty placeholder) is already on screen.

        Returns:
            True when the result table or the empty placeholder was detected;
            False when a click succeeded or the attempts ran out.
        """
        count = 0
        while True:
            soup = self.make_soup()
            try:
                self.clicking(f'//a[contains(@class,"{class_item}")]')
                break
            except Exception:
                print('skip button not yet visible')
            if count > 3:
                if self._result_rows(soup) is not None:
                    return True
                if soup.find('span', class_='el-table__empty-text') is not None:
                    return True
            try:
                self.clicking('//span[text()="Medical Devices"]')
                break
            except Exception:
                pass
            time.sleep(2)
            count += 1
            if count == 20:
                break
        return False

    def _select_page_size(self):
        """Wait for the page-size selector to render, then choose 20 rows per page."""
        print('Selecting 20 per page...')
        failures = 0
        while failures < 51:
            selector = self.make_soup().find('input', class_='el-input__inner')
            if selector is not None and selector.attrs.get("placeholder"):
                break
            # Every unsuccessful poll sleeps and counts, so the loop cannot spin hot.
            print('Record not yet loaded: ', failures)
            time.sleep(3)
            failures += 1
            if failures % 3 == 0:
                print('page refreshed....')
                self.refresh_page()
        self.clicking('//*[@id="home"]/div[3]/div[3]/div/div/span[2]/div/div[1]/input')
        self.clicking(
            '//ul[@class="el-scrollbar__view el-select-dropdown__list"]//span[text()="20条/页"]')

    def _jump_to_page(self, page_num):
        """Type ``page_num`` into the pager input and confirm with RETURN."""
        while True:
            try:
                self.entering_values('//input[@type="number"]', page_num)
                break
            except Exception:
                print('error in entering page num')
                time.sleep(3)
        self.send_keys('//input[@type="number"]')
        time.sleep(3)

    def search_data(self, current_query, page_num):
        """Run a search (when ``page_num == 1``) and walk the result pages, printing each row title.

        Args:
            current_query: Text typed into the search box.
            page_num: Page to start from; any value other than 1 skips the
                search step and jumps straight to that page.

        Raises:
            RuntimeError: when a result page does not finish loading within
                ``MAX_ROW_WAITS`` polls.

        NOTE(review): the result window opened by the search is left open and
        focused (the original had its ``close_handles`` call commented out);
        confirm whether subsequent searches expect the main window.
        """
        if page_num == 1:
            self.entering_values('//*[@id="home"]/main/div[1]/div[7]/div/div[2]/input', current_query)
            self.clicking('//*[@id="home"]/main/div[1]/div[7]/div/div[2]/div/button')
            self.switch_handle()  # the search opens a new window
            self.skip_button('introjs-nextbutton')
            self.skip_button("introjs-skipbutton")
            if self.make_soup().find('span', class_='el-table__empty-text') is None:
                self._select_page_size()
        else:
            self._jump_to_page(page_num)

        while True:
            soup = self.make_soup()
            if (self._result_rows(soup) is None
                    and soup.find('span', class_='el-table__empty-text') is not None):
                # No table at all: nothing to paginate (the original crashed here).
                print('No Results...')
                return
            total_text = soup.find('span', class_='el-pagination__total').text
            total_results = int(total_text.strip().split()[1])
            # Ceiling division; "// 20 + 1" over-counted exact multiples of 20.
            ending_page = max(1, -(-total_results // self.PAGE_SIZE))

            # Rows can arrive late: wait for a full page, or a short last page.
            for _attempt in range(self.MAX_ROW_WAITS):
                soup = self.make_soup()
                all_results = self._result_rows(soup)
                if all_results is not None:
                    if len(all_results) == self.PAGE_SIZE:
                        break
                    if len(all_results) < self.PAGE_SIZE and ending_page == page_num:
                        break
                print(all_results, " : ", total_results, " : ", ending_page)
                time.sleep(3)
            else:
                raise RuntimeError(
                    f'page {page_num} did not finish loading after {self.MAX_ROW_WAITS} attempts')

            for _result, row in enumerate(all_results):
                result = row.find_all('td')
                if not result:
                    continue
                result_title = result[1].text.strip()
                print(page_num, " : ", ending_page, " : ", _result, " / ", len(all_results), " : ",
                      result_title, " : ", total_results, " : ", ending_page)
            print(f"page_num: {page_num} Done!")

            page_num += 1
            if page_num > ending_page:
                break
            next_button = soup.find('button', class_='btn-next').attrs
            if 'disabled' in next_button:
                break
            self.clicking('//button[@class="btn-next"]')
            time.sleep(3)
if __name__ == '__main__':
    # Script entry point: open the NMPA search page, dismiss the intro overlay,
    # pick the disposable-device category and run one search per
    # (device type, year) pair. The whole flow is retried up to REY_NUM times.
    # NOTE(review): indentation was lost in the original text; the nesting
    # here is reconstructed from the statement order.
    REY_NUM = 5
    next_year = datetime.now().year + 1
    url = r'https://www.nmpa.gov.cn/datasearch/search-result.html'
    last_error = None  # "except ... as e" is unbound after its clause, so keep a copy
    for _ in range(REY_NUM):
        handle = None
        try:
            handle = INTERFACING()
            soup = handle.get_selenium_response(url)
            handle.skip_button("introjs-skipbutton")
            soup = handle.make_soup()
            if soup.find('div', class_='header-main') is None:
                print("访问失败!")
            main_page_handle = handle.get_current_handle()

            # Wait (at most 5 rounds) until the "Medical Devices" tab can be
            # clicked or the result table is already on screen.
            count = 0
            while True:
                soup = handle.make_soup()
                try:
                    handle.clicking('//span[text()="Medical Devices"]')
                    break
                except Exception:
                    print('Medical button not yet visible')
                table = soup.find('table', class_='el-table__body')
                if table is not None and table.tbody is not None:
                    break
                if handle.skip_button('introjs-skipbutton'):
                    break
                time.sleep(1)
                count += 1
                if count >= 5:
                    break

            handle.clicking("//*[@class='pc-max el-row']/div/a[@title='一次性使用医疗器械产品']")
            for device_type in ["械备", "注进", "注准"]:
                for year in range(2020, 2022):
                    current_query = f'{device_type}{year}'
                    handle.search_data(current_query, 1)
            # Success: stop retrying (without this the for/else always raised).
            break
        except Exception as e:
            last_error = e
            print(f'爬取NMPADisposableProductsRequester数据失败: 详情{e}')
        finally:
            # Always release the browser, on success as well as on failure.
            if handle is not None and handle.driver_initialized:
                handle.close_driver()
        # Only reached after a failed attempt (success leaves via break).
        time.sleep(60)
    else:
        raise Exception(
            f"已经重试{REY_NUM}次, 爬取NMPADisposableProductsRequester数据失败, 详情{last_error}") from last_error
在跳过弹窗(handle.skip_button("introjs-skipbutton"))之后:

class INTERFACING():
    """Selenium helper that drives one Chrome session and walks paginated result tables.

    NOTE(review): the original text had its statements collapsed onto long
    lines (so ``#`` comments swallowed live code); the line breaks and nesting
    used here are reconstructed from the statement order and should be
    confirmed against the author's working copy.

    The methods rely on module-level names that are not visible in this chunk:
    ``uc`` (presumably undetected_chromedriver), ``BeautifulSoup``, ``By``,
    ``Keys``, ``time``, ``random`` and ``re``.
    """

    # Rows per result page once the "20条/页" option has been selected.
    PAGE_SIZE = 20
    # Upper bound on 3-second polls while waiting for a result page to fill.
    MAX_ROW_WAITS = 40

    def __init__(self):
        self.driver_initialized = False
        self.driver = ''  # placeholder until get_driver() creates the browser
        self.MAX_TRIALS = 2
        self.main_page_handle = None  # window recorded by switch_handle()

    def make_soup(self):
        """Parse the current page source with the lxml parser."""
        return BeautifulSoup(self.driver.page_source, 'lxml')

    def current_url(self):
        """Return the URL currently loaded in the driver."""
        return self.driver.current_url

    def get_driver(self, version_main=113):
        """Start Chrome through ``uc`` and mark the driver as initialised.

        Args:
            version_main: Major Chrome version handed to ``uc.Chrome``. It is
                coerced to ``int``; the original passed the string ``"113"``.
        """
        chrome_options = uc.ChromeOptions()
        # Headless mode stays off, as in the original (the flag was commented out).
        flags = (
            "--window-size=1920,1080",  # was "1920.,1080" (stray dot)
            "--disable-extensions",
            "--disable-dev-shm-usage",
            "--disable-popup-blocking",
            "--profile-directory=Default",
            "--ignore-certificate-errors",
            "--disable-plugins-discovery",
            "--incognito",
            "--no-first-run",
            "--no-service-autorun",
            "--no-default-browser-check",
            "--password-store=basic",
            "--no-sandbox",
            '--disable-application-cache',
            '--disable-gpu',
            "--disable-setuid-sandbox",
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        )
        for flag in flags:
            chrome_options.add_argument(flag)
        self.driver = uc.Chrome(options=chrome_options, version_main=int(version_main))
        time.sleep(10)  # fixed pause after launch, kept from the original
        self.driver_initialized = True

    def close_driver(self):
        """Quit the browser if one was started; safe to call more than once."""
        if not self.driver:
            return
        try:
            self.driver.quit()
        finally:
            # Reset state so a later get_selenium_response() starts a new browser.
            self.driver = ''
            self.driver_initialized = False

    def get_selenium_response(self, url):
        """Open ``url`` (starting the driver on first use) and return the parsed page."""
        if not self.driver_initialized:
            self.get_driver()
        self.driver.get(url)
        time.sleep(3)
        return self.make_soup()

    def get_page_source(self):
        """Return the raw HTML of the current page."""
        return self.driver.page_source

    def clicking(self, xpath):
        """Click the element located by ``xpath``, then pause 2-3 seconds."""
        self.driver.find_element(By.XPATH, xpath).click()
        time.sleep(random.randint(2, 3))

    def entering_values(self, xpath, value):
        """Clear the element located by ``xpath``, type ``value``, then pause 2-4 seconds."""
        elem = self.driver.find_element(By.XPATH, xpath)
        elem.clear()
        elem.send_keys(value)
        time.sleep(random.randint(2, 4))

    def send_keys(self, xpath):
        """Send the RETURN key to the element located by ``xpath``."""
        self.driver.find_element(By.XPATH, xpath).send_keys(Keys.RETURN)

    def going_back(self):
        """Go one step back in the browser history."""
        self.driver.execute_script("window.history.go(-1)")
        time.sleep(1)

    def refresh_page(self):
        """Reload the current page."""
        self.driver.refresh()

    def close_handle(self):
        """Close the window that currently has focus."""
        self.driver.close()

    def get_current_handle(self):
        """Return the handle of the focused window."""
        return self.driver.current_window_handle

    def get_all_handles(self):
        """Return the handles of all open windows."""
        return self.driver.window_handles

    def swtich_to_window(self, handle):
        """Focus the window identified by ``handle`` (misspelled name kept for callers)."""
        self.driver.switch_to.window(handle)

    def switch_handle(self, second_handle=''):
        """Switch to the first window that is neither the current one nor ``second_handle``.

        The window focused at call time is stored in ``self.main_page_handle``.

        Returns:
            The handle switched to, or ``None`` when no other window exists.
        """
        # Read once: the focused window cannot change before the first switch.
        self.main_page_handle = self.get_current_handle()
        for handle in self.get_all_handles():
            if handle == self.main_page_handle:
                continue
            if second_handle and handle == second_handle:
                continue
            self.swtich_to_window(handle)
            return handle
        return None

    def close_handles(self, page_handle, second_handle):
        """Close the window ``page_handle`` (if open) and focus ``second_handle``."""
        if page_handle in self.get_all_handles():
            try:
                # Focus the target first; driver.close() acts on the focused window.
                self.swtich_to_window(page_handle)
                self.close_handle()
            except Exception as error:
                print(f'could not close window {page_handle}: {error}')
        self.swtich_to_window(second_handle)

    @staticmethod
    def _result_rows(soup):
        """Return the ``el-table__row`` rows of the result table, or None if it is not rendered."""
        table = soup.find('table', class_='el-table__body')
        if table is None or table.tbody is None:
            return None
        return table.tbody.find_all('tr', class_=re.compile('el-table__row'))

    def skip_button(self, class_item):
        """Try to dismiss the intro overlay by clicking ``<a class*=class_item>``.

        Polls up to 20 times. After a failed click it also tries the
        "Medical Devices" tab, and from the fifth failure on it checks whether
        the result table (or its empty placeholder) is already on screen.

        Returns:
            True when the result table or the empty placeholder was detected;
            False when a click succeeded or the attempts ran out.
        """
        count = 0
        while True:
            soup = self.make_soup()
            try:
                self.clicking(f'//a[contains(@class,"{class_item}")]')
                break
            except Exception:
                print('skip button not yet visible')
            if count > 3:
                if self._result_rows(soup) is not None:
                    return True
                if soup.find('span', class_='el-table__empty-text') is not None:
                    return True
            try:
                self.clicking('//span[text()="Medical Devices"]')
                break
            except Exception:
                pass
            time.sleep(2)
            count += 1
            if count == 20:
                break
        return False

    def _select_page_size(self):
        """Wait for the page-size selector to render, then choose 20 rows per page."""
        print('Selecting 20 per page...')
        failures = 0
        while failures < 51:
            selector = self.make_soup().find('input', class_='el-input__inner')
            if selector is not None and selector.attrs.get("placeholder"):
                break
            # Every unsuccessful poll sleeps and counts, so the loop cannot spin hot.
            print('Record not yet loaded: ', failures)
            time.sleep(3)
            failures += 1
            if failures % 3 == 0:
                print('page refreshed....')
                self.refresh_page()
        self.clicking('//*[@id="home"]/div[3]/div[3]/div/div/span[2]/div/div[1]/input')
        self.clicking(
            '//ul[@class="el-scrollbar__view el-select-dropdown__list"]//span[text()="20条/页"]')

    def _jump_to_page(self, page_num):
        """Type ``page_num`` into the pager input and confirm with RETURN."""
        while True:
            try:
                self.entering_values('//input[@type="number"]', page_num)
                break
            except Exception:
                print('error in entering page num')
                time.sleep(3)
        self.send_keys('//input[@type="number"]')
        time.sleep(3)

    def search_data(self, current_query, page_num):
        """Run a search (when ``page_num == 1``) and walk the result pages, printing each row title.

        Args:
            current_query: Text typed into the search box.
            page_num: Page to start from; any value other than 1 skips the
                search step and jumps straight to that page.

        Raises:
            RuntimeError: when a result page does not finish loading within
                ``MAX_ROW_WAITS`` polls.

        NOTE(review): the result window opened by the search is left open and
        focused (the original had its ``close_handles`` call commented out);
        confirm whether subsequent searches expect the main window.
        """
        if page_num == 1:
            self.entering_values('//*[@id="home"]/main/div[1]/div[7]/div/div[2]/input', current_query)
            self.clicking('//*[@id="home"]/main/div[1]/div[7]/div/div[2]/div/button')
            self.switch_handle()  # the search opens a new window
            self.skip_button('introjs-nextbutton')
            self.skip_button("introjs-skipbutton")
            if self.make_soup().find('span', class_='el-table__empty-text') is None:
                self._select_page_size()
        else:
            self._jump_to_page(page_num)

        while True:
            soup = self.make_soup()
            if (self._result_rows(soup) is None
                    and soup.find('span', class_='el-table__empty-text') is not None):
                # No table at all: nothing to paginate (the original crashed here).
                print('No Results...')
                return
            total_text = soup.find('span', class_='el-pagination__total').text
            total_results = int(total_text.strip().split()[1])
            # Ceiling division; "// 20 + 1" over-counted exact multiples of 20.
            ending_page = max(1, -(-total_results // self.PAGE_SIZE))

            # Rows can arrive late: wait for a full page, or a short last page.
            for _attempt in range(self.MAX_ROW_WAITS):
                soup = self.make_soup()
                all_results = self._result_rows(soup)
                if all_results is not None:
                    if len(all_results) == self.PAGE_SIZE:
                        break
                    if len(all_results) < self.PAGE_SIZE and ending_page == page_num:
                        break
                print(all_results, " : ", total_results, " : ", ending_page)
                time.sleep(3)
            else:
                raise RuntimeError(
                    f'page {page_num} did not finish loading after {self.MAX_ROW_WAITS} attempts')

            for _result, row in enumerate(all_results):
                result = row.find_all('td')
                if not result:
                    continue
                result_title = result[1].text.strip()
                print(page_num, " : ", ending_page, " : ", _result, " / ", len(all_results), " : ",
                      result_title, " : ", total_results, " : ", ending_page)
            print(f"page_num: {page_num} Done!")

            page_num += 1
            if page_num > ending_page:
                break
            next_button = soup.find('button', class_='btn-next').attrs
            if 'disabled' in next_button:
                break
            self.clicking('//button[@class="btn-next"]')
            time.sleep(3)
if __name__ == '__main__':
    # Script entry point: open the NMPA search page, dismiss the intro overlay,
    # pick the disposable-device category and run one search per
    # (device type, year) pair. The whole flow is retried up to REY_NUM times.
    # NOTE(review): the original statements were collapsed onto long lines;
    # the line breaks and nesting here are reconstructed from statement order.
    REY_NUM = 5
    next_year = datetime.now().year + 1
    url = r'https://www.nmpa.gov.cn/datasearch/search-result.html'
    last_error = None  # "except ... as e" is unbound after its clause, so keep a copy
    for _ in range(REY_NUM):
        handle = None
        try:
            handle = INTERFACING()
            soup = handle.get_selenium_response(url)
            handle.skip_button("introjs-skipbutton")
            soup = handle.make_soup()
            if soup.find('div', class_='header-main') is None:
                print("访问失败!")
            main_page_handle = handle.get_current_handle()

            # Wait (at most 5 rounds) until the "Medical Devices" tab can be
            # clicked or the result table is already on screen.
            count = 0
            while True:
                soup = handle.make_soup()
                try:
                    handle.clicking('//span[text()="Medical Devices"]')
                    break
                except Exception:
                    print('Medical button not yet visible')
                table = soup.find('table', class_='el-table__body')
                if table is not None and table.tbody is not None:
                    break
                if handle.skip_button('introjs-skipbutton'):
                    break
                time.sleep(1)
                count += 1
                if count >= 5:
                    break

            handle.clicking("//*[@class='pc-max el-row']/div/a[@title='一次性使用医疗器械产品']")
            for device_type in ["械备", "注进", "注准"]:
                for year in range(2020, 2022):
                    current_query = f'{device_type}{year}'
                    handle.search_data(current_query, 1)
            # Success: stop retrying (without this the for/else always raised).
            break
        except Exception as e:
            last_error = e
            print(f'爬取NMPADisposableProductsRequester数据失败: 详情{e}')
        finally:
            # Always release the browser, on success as well as on failure.
            if handle is not None and handle.driver_initialized:
                handle.close_driver()
        # Only reached after a failed attempt (success leaves via break).
        time.sleep(60)
    else:
        raise Exception(
            f"已经重试{REY_NUM}次, 爬取NMPADisposableProductsRequester数据失败, 详情{last_error}") from last_error
浙公网安备 33010602011771号