数据探查postgresql数据库

  1 import pandas as pd
  2 import xlrd
  3 import psycopg2
  4 
  5 class Expedition(object):
  6     def __init__(self, database, user, psd, ip, port):
  7         self.database = database
  8         self.user = user
  9         self.psd = psd
 10         self.ip = ip
 11         self.port = port
 12         self.tables = []
 13         self.schema = 'schema名'
 14 
 15     def get_table_info(self):
 16         dblink = psycopg2.connect(database=self.database, user=self.user, password=self.psd, host=self.ip, port=self.port)
 17         print("链接成功")
 18         cur = dblink.cursor()
 19         print("创建游标成功")
 20         for table in self.tables:
 21             print(table)
 22             cur.execute(f"""SELECT cast(obj_description(relfilenode,'pg_class') as varchar)  FROM    pg_class b 
 23 WHERE  relname = '{table}'""")
 24             table_value = cur.fetchall()
 25             if table_value == []:
 26                 continue
 27             # 获取表备注
 28             table_comment = table_value[0][0]
 29             print (f"获取表备注{table_comment}成功")
 30 
 31             # 获取字段列表
 32             cur.execute(f"""select column_name  from information_schema.columns where  table_catalog ='{self.database}' and table_name = '{table}'""")
 33             columns = cur.fetchall()
 34             print(f"获取字段列表:{columns}成功")
 35             if columns == []:
 36                 continue
 37             
 38             # 从字段列表中取出包裹在集合的字段名
 39             for column in columns:
 40                 # 得到字段名
 41                 column_name = column[0]
 42                 print (f"获得字段成功{column_name}")
 43 
 44                 # 获得该字段的备注信息和字段类型
 45                 print(f"""SELECT col_description(a.attrelid,a.attnum),format_type(a.atttypid,a.atttypmod) as type
 46 FROM pg_class as c,pg_attribute as a where c.relname = '{table}' and a.attrelid = c.oid and a.attnum>0 and a.attname = '{column_name}'""")
 47                 cur.execute(f"""SELECT cast(col_description(a.attrelid,a.attnum)as varchar),format_type(a.atttypid,a.atttypmod) as type
 48 FROM pg_class as c,pg_attribute as a where c.relname = '{table}' and a.attrelid = c.oid and a.attnum>0 and a.attname = '{column_name}'""")
 49                 comment = cur.fetchall()
 50                 
 51                 # 获得该字段备注和字段类型列表
 52                 print(f"获得备注类型列表:{comment}")
 53 
 54                 # 获得字段的备注和类型
 55                 column_comment = comment[0][0]
 56                 column_type = comment[0][1]
 57                 print(f"获得字段备注:{column_comment}成功,获得字段类型成功:{column_type}")
 58 
 59                 # 获得记录数
 60                 print(f"SELECT count(1),count(DISTINCT cast({column_name} as varchar)) from {self.schema}.{table}")
 61                 cur.execute(f"""SELECT count(1),count(DISTINCT cast("{column_name}" as varchar)) from {self.schema}.{table}""")
 62                 field_num = cur.fetchall()
 63                 all_num = field_num[0][0]
 64                 only_num = field_num[0][1]
 65                 print(f"记录数:{all_num},唯一记录数:{only_num}")
 66 
 67                 # 唯一率
 68                 try:
 69 
 70                     only_rate = only_num / all_num
 71                 except Exception as result:
 72                     print(result)
 73                     only_rate = 0
 74 
 75 
 76                 # 空值数量
 77                 try:
 78                     print(f"SELECT count(1) from self.schema where CAST({column_name} as varchar) != ''")
 79                     cur.execute(f"""SELECT count(1) from {self.schema}.{table}
 80 where cast("{column_name}" as varchar) = ''or "{column_name}" is null""")
 81                     null_resule = cur.fetchall()
 82                     null_num = null_resule[0][0]
 83 
 84                     null_rate = (all_num - null_num) / all_num
 85                     print(f"获得null值数量;{null_num}成功")
 86                 except Exception as result:
 87                     print(result)
 88                     null_num = "未获得数据"
 89                     null_rate = "未获得数据"
 90 
 91                 # # 获取最大值/最小值
 92                 
 93                 if column_type.startswith("i") or column_type.startswith("f") or column_type.startswith("do") or column_type.startswith("n") or column_type.startswith("bi"):
 94                     try:
 95                         print(
 96                             f"select MAX(cast({table}.{column_name} as varchar)),MIN(cast({table}.{column_name} as varchar)) from {self.schema}.{table}")
 97                         cur.execute(
 98                             f"""select MAX(cast({table}."{column_name}" as float)),MIN(cast({table}."{column_name}" as float)) from {self.schema}.{table}""")
 99                         extreme_result = cur.fetchall()
100                         max_value = extreme_result[0][0]
101                         min_value = extreme_result[0][1]
102                         print(extreme_result)
103                         print(f"最大值:{max_value},最小值:{min_value}")
104                     except:
105 
106                         print("错误信息")
107 
108                 else:
109                     try:
110                         print(
111                             f"select MAX(cast({table}.{column_name} as varchar)),MIN(cast({table}.{column_name} as varchar)) from {self.schema}.{table}")
112                         cur.execute(
113                             f"""select MAX(cast({table}."{column_name}" as varchar)),MIN(cast({table}."{column_name}" as varchar)) from {self.schema}.{table}""")
114                         extreme_result = cur.fetchall()
115                         max_value = extreme_result[0][0]
116                         min_value = extreme_result[0][1]
117                         print(extreme_result)
118                         print(f"最大值:{max_value},最小值:{min_value}")
119                     except:
120 
121                         print("错误信息")
122                 
123                 # # 获取最大长度和最小长度
124                 try:
125                     cur.execute(f"""SELECT max(char_length(CAST("{column_name}" as VARCHAR))),\
126                 min(char_length(CAST("{column_name}" as VARCHAR))) from {self.schema}.{table}""")
127                     column_lengtn = cur.fetchall()
128                     max_lenth = column_lengtn[0][0]
129                     min_lenth = column_lengtn[0][1]
130                     print(f"最大长度:{max_lenth},最小长度:{min_lenth}")
131                 except Exception as err:
132                     max_lenth = "未获得数据"
133                     min_lenth = "未获得数据"
134                 
135                 # 获得样例数据
136                 wordlist = []
137                 try:
138                     print(f"""SELECT distinct cast({column_name} as varchar) from {self.schema}.{table} where CAST('{column_name}' as varchar) != ''
139 LIMIT 4""")
140                     cur.execute(f"""SELECT distinct cast("{column_name}" as varchar) from {self.schema}.{table} where CAST("{column_name}" as varchar) != ''
141 LIMIT 4""")
142                     words = cur.fetchall()
143                     for word in words:
144                         print(word)
145                         wordlist.append(word[0])
146                     print(wordlist)
147                 except:
148                     wordlist = ['未获得数据']
149                 
150 
151                
152                 
153 
154                 # print(table, table_comment, column_name, column_comment, column_type, all_num, null_num, only_num, only_rate, max_value, min_value, max_lenth, min_lenth, f"{wordlist}")
155                 # 调用写入
156                 self.write_into_excel(table, table_comment, column_name, column_comment, column_type, all_num, null_num, null_rate, only_num, \
157                                       only_rate, max_value, min_value, max_lenth, min_lenth, f"{wordlist}")
158                 print("写入成功")
159             
160             
161 
162 
163 
164     # 从表格获取表明
165     def get_excel_value(self):
166         # 打开
167         data = xlrd.open_workbook(r"C:\Users\Administrator\Desktop\未探查总.xlsx")
168         # 获取sheet对象
169         table = data.sheet_by_name(U"Sheet1")
170 
171         # row = table.row_values(0)
172         # 读取第一列数据,返回一个列表
173         col = table.col_values(0)
174         self.tables = col
175         print (f"共从excel中读取{len(self.tables)}个表名")
176 
177 
178     # 将数据写入表格
179     def write_into_excel(self, table_name, table_comment, column_name, column_comment, column_type, all_num, null_num, null_rate, only_num, only_rate, max_value, min_value,max_lenth,min_lenth,word_list):
180 
181         df = pd.DataFrame(pd.read_excel('C:\\Users\Administrator\Desktop\探查模板.xlsx'))  # 读取原数据
182         df_rows = df.shape[0]  # 获取行数
183         # 增加一条数据
184         df.loc[df_rows] = [table_name, table_comment, column_name, column_comment, column_type, all_num, null_num,null_rate, only_num, only_rate, max_value, min_value, max_lenth, min_lenth, word_list]  # 与原数据同格式
185         df.to_excel('C:\\Users\Administrator\Desktop\探查模板.xlsx', sheet_name='sheet1', index=False, header=True)
186 
187     
188 
189 
190 if __name__=='__main__':
191     run = Expedition("数据库名", "账户名", "密码", "ip地址", 端口号)
192     run.get_excel_value()
193     run.get_table_info()

 

posted @ 2020-03-17 09:53  芦苇了嘿  阅读(348)  评论(1编辑  收藏  举报