from urllib2 import urlopen
from bs4 import BeautifulSoup
# Get the next page url from the current page url
def get_next_page_url(url):
page = urlopen(url)
soup_page = BeautifulSoup(page, 'lxml')
page.close()
# Get current page and next page tag
current_page_tag = soup_page.find(class_="current")
next_page_tag = current_page_tag.find_next_sibling()
# Check if the current page is the last one
if next_page_tag is None:
next_page_url = None
else:
next_page_url = next_page_tag['href']
return next_page_url
# Get the book detail urls by page url
def get_book_detail_urls(url):
page = urlopen(url)
soup = BeautifulSoup(page, 'lxml')
page.close()
urls = []
book_header_tags = soup.find_all(class_="entry-title")
for book_header_tag in book_header_tags:
urls.append(book_header_tag.a['href'])
return urls
# Get the book detail info by book detail url
def get_book_isbn(url):
page = urlopen(url)
book_isbn_soup = BeautifulSoup(page, 'lxml')
page.close()
title_tag = book_isbn_soup.find(class_="single-title")
title = title_tag.string
isbn_key_tag = book_isbn_soup.find(text="ISBN-10:").parent
isbn_tag = isbn_key_tag.find_next_sibling()
isbn = isbn_tag.string.strip() # Remove the whitespace with the strip method
return { 'title': title, 'ISBN': isbn }
def start():
url = "http://www.allitebooks.com/certification/"
book_info_list = []
def next_page(page_url):
book_detail_urls = get_book_detail_urls(page_url)
for book_detail_url in book_detail_urls: #print all books ISBN one by one
# print(book_detail_url)
book_info = get_book_isbn(book_detail_url)
print(book_info) # print ISBD
book_info_list.append(book_info)
next_page_url = get_next_page_url(page_url)
if next_page_url is not None:
next_page(next_page_url)
else:
return 0
next_page(url)