selenium click skip_button("introjs-skipbutton")

class INTERFACING():
    """Wrap a Chrome session created through ``uc.Chrome`` for the NMPA search pages.

    The class bundles low-level browser helpers (click, type, switch window),
    page parsing through ``BeautifulSoup`` and the search/pagination flow used
    by the script at the bottom of the file.
    """

    # Rows per result page once the "20条/页" option has been selected.
    PAGE_SIZE = 20
    # Upper bound on 3-second polls while waiting for the page to render
    # (the same limit the page-size selector wait has always used).
    MAX_POLLS = 51

    def __init__(self):
        self.driver_initialized = False
        # None (falsy, like the former '') until get_driver() has run.
        self.driver = None
        # Handle of the window that was focused when switch_handle() ran.
        self.main_page_handle = None
        self.MAX_TRIALS = 2

    @staticmethod
    def _result_rows(soup):
        """Return the ``el-table__row`` rows of the result table, or ``[]`` if the table is absent."""
        table = soup.find('table', class_='el-table__body')
        if table is None or table.tbody is None:
            return []
        return table.tbody.find_all('tr', class_=re.compile('el-table__row'))

    @staticmethod
    def _last_page(total_results, page_size=20):
        """Return the 1-based number of the last result page (at least 1)."""
        return max(1, -(-total_results // page_size))

    @staticmethod
    def _expected_rows(total_results, page_num, page_size=20):
        """Return how many rows page ``page_num`` should show for ``total_results`` hits."""
        remaining = total_results - (page_num - 1) * page_size
        return max(0, min(page_size, remaining))

    def make_soup(self):
        """Parse the current page source with the ``lxml`` parser."""
        return BeautifulSoup(self.driver.page_source, 'lxml')

    def current_url(self):
        """Return the URL the driver is currently on."""
        return self.driver.current_url

    def get_driver(self):
        """Start Chrome through ``uc.Chrome`` and mark the driver as initialized."""
        chrome_options = uc.ChromeOptions()

        # chrome_options.add_argument("--headless")
        for argument in (
            # Fixed: was "1920.,1080", which is not a valid WIDTH,HEIGHT pair.
            "--window-size=1920,1080",
            "--disable-extensions",
            "--disable-dev-shm-usage",
            "--disable-popup-blocking",
            "--profile-directory=Default",
            "--ignore-certificate-errors",
            "--disable-plugins-discovery",
            "--incognito",
            "--no-first-run",
            "--no-service-autorun",
            "--no-default-browser-check",
            "--password-store=basic",
            "--no-sandbox",
            '--disable-application-cache',
            '--disable-gpu',
            "--disable-setuid-sandbox",
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        ):
            chrome_options.add_argument(argument)

        # version_main is passed as an int (major Chrome version), not a string.
        self.driver = uc.Chrome(options=chrome_options, version_main=113)
        time.sleep(10)
        self.driver_initialized = True

    def close_driver(self):
        """Quit the browser if one was started; safe to call when none exists."""
        if not self.driver:
            return
        try:
            self.driver.quit()
        finally:
            # Allow get_selenium_response() to start a fresh browser later.
            self.driver = None
            self.driver_initialized = False

    def get_selenium_response(self, url):
        """Open ``url`` (starting the driver on first use) and return the parsed page."""
        if not self.driver_initialized:
            self.get_driver()
        self.driver.get(url)
        time.sleep(3)
        return self.make_soup()

    def get_page_source(self):
        """Return the raw page source of the current window."""
        return self.driver.page_source

    def clicking(self, xpath):
        """Click the element at ``xpath`` and pause 2-3 seconds.

        Raises whatever ``find_element``/``click`` raise when the element is
        missing or not clickable; callers rely on that to poll.
        """
        self.driver.find_element(By.XPATH, xpath).click()
        time.sleep(random.randint(2, 3))

    def entering_values(self, xpath, value):
        """Clear the element at ``xpath``, type ``value`` and pause 2-4 seconds."""
        elem = self.driver.find_element(By.XPATH, xpath)
        elem.clear()
        elem.send_keys(value)
        time.sleep(random.randint(2, 4))

    def send_keys(self, xpath):
        """Send the RETURN key to the element at ``xpath``."""
        self.driver.find_element(By.XPATH, xpath).send_keys(Keys.RETURN)

    def going_back(self):
        """Navigate one step back in the browser history."""
        self.driver.execute_script("window.history.go(-1)")
        time.sleep(1)

    def refresh_page(self):
        """Reload the current page."""
        self.driver.refresh()

    def close_handle(self):
        """Close the currently focused window."""
        self.driver.close()

    def get_current_handle(self):
        """Return the handle of the currently focused window."""
        return self.driver.current_window_handle

    def get_all_handles(self):
        """Return the handles of all open windows."""
        return self.driver.window_handles

    def swtich_to_window(self, handle):
        """Focus the window identified by ``handle`` (name kept misspelled for callers)."""
        self.driver.switch_to.window(handle)

    # Correctly spelled alias; the original name stays available.
    switch_to_window = swtich_to_window

    def switch_handle(self, second_handle=''):
        """Focus the first window that is neither the current one nor ``second_handle``.

        Records the current window in ``self.main_page_handle``.

        Returns:
            The handle that was switched to, or ``None`` when no other window exists.
        """
        self.main_page_handle = self.get_current_handle()
        for handle in self.get_all_handles():
            if handle == self.main_page_handle:
                continue
            if second_handle and handle == second_handle:
                continue
            self.swtich_to_window(handle)
            return handle
        return None

    def close_handles(self, page_handle, second_handle):
        """Close the window ``page_handle`` (if open) and focus ``second_handle``."""
        if page_handle in self.get_all_handles():
            try:
                # Focus the target first: driver.close() closes the focused
                # window, which is not necessarily page_handle.
                self.swtich_to_window(page_handle)
                self.close_handle()
            except Exception as error:
                print(f'failed to close window {page_handle}: {error}')

        self.swtich_to_window(second_handle)

    def skip_button(self, class_item):
        """Poll for an intro overlay link whose class contains ``class_item`` and click it.

        Each round also tries the "Medical Devices" tab; a successful click on
        either ends the polling. After more than three failed rounds the
        method returns ``True`` as soon as the result table or the empty-table
        marker is present. Gives up after 20 rounds.

        Returns:
            ``True`` when results (or the empty marker) are already showing,
            otherwise ``False``.
        """
        attempts = 0
        while True:
            soup = self.make_soup()

            try:
                self.clicking(f'//a[contains(@class,"{class_item}")]')
                break
            except Exception:
                print('skip button not yet visible')

            if attempts > 3:
                table = soup.find('table', class_='el-table__body')
                if table is not None and table.tbody is not None:
                    return True
                if soup.find('span', class_='el-table__empty-text') is not None:
                    return True

            try:
                self.clicking('//span[text()="Medical Devices"]')
                break
            except Exception:
                pass

            time.sleep(2)
            attempts += 1
            if attempts == 20:
                break
        return False

    def search_data(self, current_query, page_num):
        """Run a search (or jump to ``page_num``) and print every result title page by page.

        Args:
            current_query: Text typed into the search box when ``page_num`` is 1.
            page_num: 1 to start a new search; any other value jumps to that
                page of the result list that is already open.

        Raises:
            TimeoutError: when the page-number input or the expected rows do
                not appear within ``MAX_POLLS`` polls.
        """
        if page_num == 1:
            self.entering_values('//*[@id="home"]/main/div[1]/div[7]/div/div[2]/input', current_query)
            self.clicking('//*[@id="home"]/main/div[1]/div[7]/div/div[2]/div/button')
            # The search opens in a new window; move focus there.
            self.switch_handle()
            self.skip_button('introjs-nextbutton')
            self.skip_button("introjs-skipbutton")
            soup = self.make_soup()

            if soup.find('span', class_='el-table__empty-text') is None:
                print('Selecting 20 per page...')
                polls = 0
                while True:
                    soup = self.make_soup()
                    page_selector = soup.find('input', class_='el-input__inner')
                    if page_selector is None:
                        print('Record not yet loaded: ', polls)
                    elif page_selector.attrs.get("placeholder"):
                        break
                    time.sleep(3)
                    polls += 1
                    if not polls % 3:
                        print('page refreshed....')
                        self.refresh_page()
                    if polls >= self.MAX_POLLS:
                        break
                self.clicking('//*[@id="home"]/div[3]/div[3]/div/div/span[2]/div/div[1]/input')
                self.clicking(
                    '//ul[@class="el-scrollbar__view el-select-dropdown__list"]//span[text()="20条/页"]')
        else:
            for _ in range(self.MAX_POLLS):
                try:
                    self.entering_values('//input[@type="number"]', page_num)
                    break
                except Exception:
                    print('error in entering page num')
                time.sleep(3)
            else:
                raise TimeoutError(f'could not enter page number {page_num}')
            self.send_keys('//input[@type="number"]')
            time.sleep(3)

        while True:
            soup = self.make_soup()

            if not self._result_rows(soup) and soup.find('span', class_='el-table__empty-text') is not None:
                print('No Results...')

            total_results = int(soup.find('span', class_='el-pagination__total').text.strip().split()[1])
            ending_page = self._last_page(total_results, self.PAGE_SIZE)
            expected = self._expected_rows(total_results, page_num, self.PAGE_SIZE)

            # Sometimes it takes long to load all the records on the page, so
            # wait until every row this page should hold is present (a full
            # page, or the remainder on the last page).
            for _ in range(self.MAX_POLLS):
                soup = self.make_soup()
                all_results = self._result_rows(soup)
                if len(all_results) >= expected:
                    break
                print(all_results, " : ", total_results, " : ", ending_page)
                time.sleep(3)
            else:
                raise TimeoutError(
                    f'page {page_num}: expected {expected} rows, found {len(all_results)}')

            for index, row in enumerate(all_results):
                cells = row.find_all('td')
                # The title lives in the second cell; skip malformed rows.
                if len(cells) < 2:
                    continue

                result_title = cells[1].text.strip()

                print(page_num, " : ", ending_page, " : ", index, " / ", len(all_results), " : ",
                      result_title, " : ", total_results, " : ", ending_page)

            print(f"page_num: {page_num} Done!")
            page_num += 1
            if page_num > ending_page:
                break
            next_button = soup.find('button', class_='btn-next')
            if next_button is None or 'disabled' in next_button.attrs:
                break
            self.clicking('//button[@class="btn-next"]')
            time.sleep(3)




if __name__ == '__main__':
    # Script entry point: open the NMPA search page, dismiss the intro
    # overlay, pick the disposable-device category and run one search per
    # device type and year. The whole flow is retried up to REY_NUM times.
    REY_NUM = 5
    url = r'https://www.nmpa.gov.cn/datasearch/search-result.html'
    # Kept outside the loop: the name bound by `except ... as e` is unbound
    # again when the except block ends, so it cannot be used in the for/else.
    last_error = None
    # with Display(visible=0, size=(1920, 1080)) as display:
    for _ in range(REY_NUM):
        handle = INTERFACING()
        try:
            handle.get_selenium_response(url)
            handle.skip_button("introjs-skipbutton")

            soup = handle.make_soup()
            if soup.find('div', class_='header-main') is None:
                print("访问失败！")

            count = 0
            while True:
                soup = handle.make_soup()

                try:
                    handle.clicking('//span[text()="Medical Devices"]')
                    break
                except Exception:
                    print('Medical button not yet visible')

                # A rendered result table means the overlay is already gone.
                table = soup.find('table', class_='el-table__body')
                if table is not None and table.tbody is not None:
                    break

                if handle.skip_button('introjs-skipbutton'):
                    break

                time.sleep(1)
                count += 1
                if count >= 5:
                    break

            handle.clicking("//*[@class='pc-max el-row']/div/a[@title='一次性使用医疗器械产品']")
            for device_type in ["械备", "注进", "注准"]:
                for year in range(2020, 2022):
                    current_query = f'{device_type}{year}'
                    handle.search_data(current_query, 1)
        except Exception as e:
            last_error = e
            print(f'爬取NMPADisposableProductsRequester数据失败: 详情{e}')
        else:
            # Success: stop retrying (the finally clause still closes Chrome).
            break
        finally:
            # Only quit a browser that actually started.
            if handle.driver_initialized:
                handle.close_driver()
        # Reached only after a failed attempt: back off before retrying.
        time.sleep(60)
    else:
        raise Exception(
            f"已经重试{REY_NUM}次, 爬取NMPADisposableProductsRequester数据失败, 详情{last_error}") from last_error

　　跳过弹窗 handle.skip_button("introjs-skipbutton")

之后

class INTERFACING():
    """Wrap a Chrome session created through ``uc.Chrome`` for the NMPA search pages.

    The class bundles low-level browser helpers (click, type, switch window),
    page parsing through ``BeautifulSoup`` and the search/pagination flow used
    by the script at the bottom of the file.
    """

    # Rows per result page once the "20条/页" option has been selected.
    PAGE_SIZE = 20
    # Upper bound on 3-second polls while waiting for the page to render
    # (the same limit the page-size selector wait has always used).
    MAX_POLLS = 51

    def __init__(self):
        self.driver_initialized = False
        # None (falsy, like the former '') until get_driver() has run.
        self.driver = None
        # Handle of the window that was focused when switch_handle() ran.
        self.main_page_handle = None
        self.MAX_TRIALS = 2

    @staticmethod
    def _result_rows(soup):
        """Return the ``el-table__row`` rows of the result table, or ``[]`` if the table is absent."""
        table = soup.find('table', class_='el-table__body')
        if table is None or table.tbody is None:
            return []
        return table.tbody.find_all('tr', class_=re.compile('el-table__row'))

    @staticmethod
    def _last_page(total_results, page_size=20):
        """Return the 1-based number of the last result page (at least 1)."""
        return max(1, -(-total_results // page_size))

    @staticmethod
    def _expected_rows(total_results, page_num, page_size=20):
        """Return how many rows page ``page_num`` should show for ``total_results`` hits."""
        remaining = total_results - (page_num - 1) * page_size
        return max(0, min(page_size, remaining))

    def make_soup(self):
        """Parse the current page source with the ``lxml`` parser."""
        return BeautifulSoup(self.driver.page_source, 'lxml')

    def current_url(self):
        """Return the URL the driver is currently on."""
        return self.driver.current_url

    def get_driver(self):
        """Start Chrome through ``uc.Chrome`` and mark the driver as initialized."""
        chrome_options = uc.ChromeOptions()

        # chrome_options.add_argument("--headless")
        for argument in (
            # Fixed: was "1920.,1080", which is not a valid WIDTH,HEIGHT pair.
            "--window-size=1920,1080",
            "--disable-extensions",
            "--disable-dev-shm-usage",
            "--disable-popup-blocking",
            "--profile-directory=Default",
            "--ignore-certificate-errors",
            "--disable-plugins-discovery",
            "--incognito",
            "--no-first-run",
            "--no-service-autorun",
            "--no-default-browser-check",
            "--password-store=basic",
            "--no-sandbox",
            '--disable-application-cache',
            '--disable-gpu',
            "--disable-setuid-sandbox",
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        ):
            chrome_options.add_argument(argument)

        # version_main is passed as an int (major Chrome version), not a string.
        self.driver = uc.Chrome(options=chrome_options, version_main=113)
        time.sleep(10)
        self.driver_initialized = True

    def close_driver(self):
        """Quit the browser if one was started; safe to call when none exists."""
        if not self.driver:
            return
        try:
            self.driver.quit()
        finally:
            # Allow get_selenium_response() to start a fresh browser later.
            self.driver = None
            self.driver_initialized = False

    def get_selenium_response(self, url):
        """Open ``url`` (starting the driver on first use) and return the parsed page."""
        if not self.driver_initialized:
            self.get_driver()
        self.driver.get(url)
        time.sleep(3)
        return self.make_soup()

    def get_page_source(self):
        """Return the raw page source of the current window."""
        return self.driver.page_source

    def clicking(self, xpath):
        """Click the element at ``xpath`` and pause 2-3 seconds.

        Raises whatever ``find_element``/``click`` raise when the element is
        missing or not clickable; callers rely on that to poll.
        """
        self.driver.find_element(By.XPATH, xpath).click()
        time.sleep(random.randint(2, 3))

    def entering_values(self, xpath, value):
        """Clear the element at ``xpath``, type ``value`` and pause 2-4 seconds."""
        elem = self.driver.find_element(By.XPATH, xpath)
        elem.clear()
        elem.send_keys(value)
        time.sleep(random.randint(2, 4))

    def send_keys(self, xpath):
        """Send the RETURN key to the element at ``xpath``."""
        self.driver.find_element(By.XPATH, xpath).send_keys(Keys.RETURN)

    def going_back(self):
        """Navigate one step back in the browser history."""
        self.driver.execute_script("window.history.go(-1)")
        time.sleep(1)

    def refresh_page(self):
        """Reload the current page."""
        self.driver.refresh()

    def close_handle(self):
        """Close the currently focused window."""
        self.driver.close()

    def get_current_handle(self):
        """Return the handle of the currently focused window."""
        return self.driver.current_window_handle

    def get_all_handles(self):
        """Return the handles of all open windows."""
        return self.driver.window_handles

    def swtich_to_window(self, handle):
        """Focus the window identified by ``handle`` (name kept misspelled for callers)."""
        self.driver.switch_to.window(handle)

    # Correctly spelled alias; the original name stays available.
    switch_to_window = swtich_to_window

    def switch_handle(self, second_handle=''):
        """Focus the first window that is neither the current one nor ``second_handle``.

        Records the current window in ``self.main_page_handle``.

        Returns:
            The handle that was switched to, or ``None`` when no other window exists.
        """
        self.main_page_handle = self.get_current_handle()
        for handle in self.get_all_handles():
            if handle == self.main_page_handle:
                continue
            if second_handle and handle == second_handle:
                continue
            self.swtich_to_window(handle)
            return handle
        return None

    def close_handles(self, page_handle, second_handle):
        """Close the window ``page_handle`` (if open) and focus ``second_handle``."""
        if page_handle in self.get_all_handles():
            try:
                # Focus the target first: driver.close() closes the focused
                # window, which is not necessarily page_handle.
                self.swtich_to_window(page_handle)
                self.close_handle()
            except Exception as error:
                print(f'failed to close window {page_handle}: {error}')

        self.swtich_to_window(second_handle)

    def skip_button(self, class_item):
        """Poll for an intro overlay link whose class contains ``class_item`` and click it.

        Each round also tries the "Medical Devices" tab; a successful click on
        either ends the polling. After more than three failed rounds the
        method returns ``True`` as soon as the result table or the empty-table
        marker is present. Gives up after 20 rounds.

        Returns:
            ``True`` when results (or the empty marker) are already showing,
            otherwise ``False``.
        """
        attempts = 0
        while True:
            soup = self.make_soup()

            try:
                self.clicking(f'//a[contains(@class,"{class_item}")]')
                break
            except Exception:
                print('skip button not yet visible')

            if attempts > 3:
                table = soup.find('table', class_='el-table__body')
                if table is not None and table.tbody is not None:
                    return True
                if soup.find('span', class_='el-table__empty-text') is not None:
                    return True

            try:
                self.clicking('//span[text()="Medical Devices"]')
                break
            except Exception:
                pass

            time.sleep(2)
            attempts += 1
            if attempts == 20:
                break
        return False

    def search_data(self, current_query, page_num):
        """Run a search (or jump to ``page_num``) and print every result title page by page.

        Args:
            current_query: Text typed into the search box when ``page_num`` is 1.
            page_num: 1 to start a new search; any other value jumps to that
                page of the result list that is already open.

        Raises:
            TimeoutError: when the page-number input or the expected rows do
                not appear within ``MAX_POLLS`` polls.
        """
        if page_num == 1:
            self.entering_values('//*[@id="home"]/main/div[1]/div[7]/div/div[2]/input', current_query)
            self.clicking('//*[@id="home"]/main/div[1]/div[7]/div/div[2]/div/button')
            # The search opens in a new window; move focus there.
            self.switch_handle()
            self.skip_button('introjs-nextbutton')
            self.skip_button("introjs-skipbutton")
            soup = self.make_soup()

            if soup.find('span', class_='el-table__empty-text') is None:
                print('Selecting 20 per page...')
                polls = 0
                while True:
                    soup = self.make_soup()
                    page_selector = soup.find('input', class_='el-input__inner')
                    if page_selector is None:
                        print('Record not yet loaded: ', polls)
                    elif page_selector.attrs.get("placeholder"):
                        break
                    time.sleep(3)
                    polls += 1
                    if not polls % 3:
                        print('page refreshed....')
                        self.refresh_page()
                    if polls >= self.MAX_POLLS:
                        break
                self.clicking('//*[@id="home"]/div[3]/div[3]/div/div/span[2]/div/div[1]/input')
                self.clicking(
                    '//ul[@class="el-scrollbar__view el-select-dropdown__list"]//span[text()="20条/页"]')
        else:
            for _ in range(self.MAX_POLLS):
                try:
                    self.entering_values('//input[@type="number"]', page_num)
                    break
                except Exception:
                    print('error in entering page num')
                time.sleep(3)
            else:
                raise TimeoutError(f'could not enter page number {page_num}')
            self.send_keys('//input[@type="number"]')
            time.sleep(3)

        while True:
            soup = self.make_soup()

            if not self._result_rows(soup) and soup.find('span', class_='el-table__empty-text') is not None:
                print('No Results...')

            total_results = int(soup.find('span', class_='el-pagination__total').text.strip().split()[1])
            ending_page = self._last_page(total_results, self.PAGE_SIZE)
            expected = self._expected_rows(total_results, page_num, self.PAGE_SIZE)

            # Sometimes it takes long to load all the records on the page, so
            # wait until every row this page should hold is present (a full
            # page, or the remainder on the last page).
            for _ in range(self.MAX_POLLS):
                soup = self.make_soup()
                all_results = self._result_rows(soup)
                if len(all_results) >= expected:
                    break
                print(all_results, " : ", total_results, " : ", ending_page)
                time.sleep(3)
            else:
                raise TimeoutError(
                    f'page {page_num}: expected {expected} rows, found {len(all_results)}')

            for index, row in enumerate(all_results):
                cells = row.find_all('td')
                # The title lives in the second cell; skip malformed rows.
                if len(cells) < 2:
                    continue

                result_title = cells[1].text.strip()

                print(page_num, " : ", ending_page, " : ", index, " / ", len(all_results), " : ",
                      result_title, " : ", total_results, " : ", ending_page)

            print(f"page_num: {page_num} Done!")
            page_num += 1
            if page_num > ending_page:
                break
            next_button = soup.find('button', class_='btn-next')
            if next_button is None or 'disabled' in next_button.attrs:
                break
            self.clicking('//button[@class="btn-next"]')
            time.sleep(3)

if __name__ == '__main__':
    # Script entry point: open the NMPA search page, dismiss the intro
    # overlay, pick the disposable-device category and run one search per
    # device type and year. The whole flow is retried up to REY_NUM times.
    REY_NUM = 5
    url = r'https://www.nmpa.gov.cn/datasearch/search-result.html'
    # Kept outside the loop: the name bound by `except ... as e` is unbound
    # again when the except block ends, so it cannot be used in the for/else.
    last_error = None
    # with Display(visible=0, size=(1920, 1080)) as display:
    for _ in range(REY_NUM):
        handle = INTERFACING()
        try:
            handle.get_selenium_response(url)
            handle.skip_button("introjs-skipbutton")

            soup = handle.make_soup()
            if soup.find('div', class_='header-main') is None:
                print("访问失败！")

            count = 0
            while True:
                soup = handle.make_soup()

                try:
                    handle.clicking('//span[text()="Medical Devices"]')
                    break
                except Exception:
                    print('Medical button not yet visible')

                # A rendered result table means the overlay is already gone.
                table = soup.find('table', class_='el-table__body')
                if table is not None and table.tbody is not None:
                    break

                if handle.skip_button('introjs-skipbutton'):
                    break

                time.sleep(1)
                count += 1
                if count >= 5:
                    break

            handle.clicking("//*[@class='pc-max el-row']/div/a[@title='一次性使用医疗器械产品']")
            for device_type in ["械备", "注进", "注准"]:
                for year in range(2020, 2022):
                    current_query = f'{device_type}{year}'
                    handle.search_data(current_query, 1)
        except Exception as e:
            last_error = e
            print(f'爬取NMPADisposableProductsRequester数据失败: 详情{e}')
        else:
            # Success: stop retrying (the finally clause still closes Chrome).
            break
        finally:
            # Only quit a browser that actually started.
            if handle.driver_initialized:
                handle.close_driver()
        # Reached only after a failed attempt: back off before retrying.
        time.sleep(60)
    else:
        raise Exception(
            f"已经重试{REY_NUM}次, 爬取NMPADisposableProductsRequester数据失败, 详情{last_error}") from last_error

posted on 2023-06-05 16:09 明媚的夏午 阅读(121) 评论(0) 收藏 举报

刷新页面返回顶部

导航

selenium click skip_button("introjs-skipbutton")