\本文涉及到的知识有PostgreSQL数据库使用,文件操作,以及爬虫相关知识,用到了requests、BeautifulSoup两个模块
\pip install requests,pip install beautifulsoup4,pip install html5lib 安装三个模块(连接PostgreSQL还需要 pip install psycopg2)
\站在别人的肩上,虽然很简单,但还是折腾了半天,主要是数据库管理软件的使用没有完全掌握,建议读者遇到不懂的地方一定要及时查阅相关资料,读一篇文章就要将其读懂,慢慢积累,定会有收获!
1 import psycopg2
2 import requests
3 from bs4 import BeautifulSoup \注意这里,导入的时候特别注意
4
5
def create_table():
    """Create the ``article`` table if it does not already exist.

    Connects to the local PostgreSQL database ``test`` and runs a
    ``create table if not exists`` statement, so calling this more than once
    is harmless. On success a confirmation is printed; on a database error
    the transaction is rolled back and the error is printed.

    Note: this creates a table, not a database -- the ``test`` database must
    already exist.

    Raises:
        psycopg2.Error: if the connection itself cannot be established.
    """
    # NOTE(review): credentials are hard-coded here (and again in
    # insert_table); consider moving them to configuration.
    conn = psycopg2.connect(database="test", user="postgres", password="123", host="127.0.0.1", port="5432")
    sql = "create table if not exists article ( article_title text ,article_author text ,article_content text)"
    try:
        # The cursor is opened inside the try block so that the connection is
        # still closed by ``finally`` if cursor creation fails.
        with conn.cursor() as cur:
            cur.execute(sql)
        conn.commit()
        print("create table success")
    except psycopg2.Error as e:
        # Only database errors are handled here; KeyboardInterrupt and
        # SystemExit are allowed to propagate.
        conn.rollback()
        print(e)
    finally:
        conn.close()
20
21
def get_html_data(url="http://meiriyiwen.com/random", timeout=10):
    """Download an article page, extract its fields and store them.

    The page is parsed with BeautifulSoup using the ``html5lib`` parser. The
    title, author and the ``<p>`` elements of the article body are extracted
    and handed to ``insert_table``.

    Args:
        url: Page to download. Defaults to the random-article URL.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the script forever.

    Returns:
        The value returned by ``insert_table`` when the page was parsed, or
        ``False`` when an expected element is missing from the page.

    Raises:
        requests.RequestException: on a network failure, a timeout, or an
            HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html5lib")
    article = soup.find("div", id="article_show")
    if article is None:
        print("article container 'article_show' not found in page")
        return False

    title_tag = article.h1
    author_tag = article.find("p", class_="article_author")
    text_tag = article.find("div", class_="article_text")
    if title_tag is None or author_tag is None or text_tag is None:
        print("article title, author or text element not found in page")
        return False

    # get_text() always yields a plain str, whereas .string is None as soon as
    # the tag contains nested markup.
    article_title = title_tag.get_text()
    article_author = author_tag.get_text()
    # Each paragraph is stored as HTML (str(tag) keeps the <p> markup).
    article_content = "".join(str(paragraph) for paragraph in text_tag.find_all("p"))

    return insert_table(article_title, article_author, article_content)
33
34
def insert_table(article_title, article_author, article_content):
    """Store an article in the ``article`` table unless its title already exists.

    Args:
        article_title: Title of the article; also used for the duplicate check.
        article_author: Author of the article.
        article_content: Body of the article.

    Returns:
        ``True`` if a new row was inserted and committed. ``False`` if a row
        with the same title already exists, or if a database error occurred
        (the error is printed and the transaction rolled back).

    Raises:
        psycopg2.Error: if the connection itself cannot be established.
    """
    # NOTE(review): credentials are hard-coded here (and again in
    # create_table); consider moving them to configuration.
    conn = psycopg2.connect(database="test", user="postgres", password="123", host="127.0.0.1", port="5432")
    # Only existence matters, so do not pull whole rows back to the client.
    query_sql = "select 1 from article where article_title=%s limit 1"
    sql = "insert into article (article_title,article_author,article_content) VALUES (%s,%s,%s)"
    try:
        with conn.cursor() as cur:
            cur.execute(query_sql, (article_title,))
            if cur.fetchone() is not None:
                return False
            # Values are passed as parameters so the driver handles quoting.
            cur.execute(sql, (article_title, article_author, article_content))
        conn.commit()
        return True
    except psycopg2.Error as e:
        # Only database errors are handled here; KeyboardInterrupt and
        # SystemExit are allowed to propagate.
        conn.rollback()
        print(e)
        return False
    finally:
        conn.close()
57
58
if __name__ == "__main__":
    # The original note said to remove this call once the table exists. The
    # DDL uses "create table if not exists", so leaving it in is harmless.
    create_table()
    get_html_data()