1 import pandas as pd
2 import xlrd
3 import psycopg2
4
5 class Expedition(object):
6 def __init__(self, database, user, psd, ip, port):
7 self.database = database
8 self.user = user
9 self.psd = psd
10 self.ip = ip
11 self.port = port
12 self.tables = []
13 self.schema = 'schema名'
14
15 def get_table_info(self):
16 dblink = psycopg2.connect(database=self.database, user=self.user, password=self.psd, host=self.ip, port=self.port)
17 print("链接成功")
18 cur = dblink.cursor()
19 print("创建游标成功")
20 for table in self.tables:
21 print(table)
22 cur.execute(f"""SELECT cast(obj_description(relfilenode,'pg_class') as varchar) FROM pg_class b
23 WHERE relname = '{table}'""")
24 table_value = cur.fetchall()
25 if table_value == []:
26 continue
27 # 获取表备注
28 table_comment = table_value[0][0]
29 print (f"获取表备注{table_comment}成功")
30
31 # 获取字段列表
32 cur.execute(f"""select column_name from information_schema.columns where table_catalog ='{self.database}' and table_name = '{table}'""")
33 columns = cur.fetchall()
34 print(f"获取字段列表:{columns}成功")
35 if columns == []:
36 continue
37
38 # 从字段列表中取出包裹在集合的字段名
39 for column in columns:
40 # 得到字段名
41 column_name = column[0]
42 print (f"获得字段成功{column_name}")
43
44 # 获得该字段的备注信息和字段类型
45 print(f"""SELECT col_description(a.attrelid,a.attnum),format_type(a.atttypid,a.atttypmod) as type
46 FROM pg_class as c,pg_attribute as a where c.relname = '{table}' and a.attrelid = c.oid and a.attnum>0 and a.attname = '{column_name}'""")
47 cur.execute(f"""SELECT cast(col_description(a.attrelid,a.attnum)as varchar),format_type(a.atttypid,a.atttypmod) as type
48 FROM pg_class as c,pg_attribute as a where c.relname = '{table}' and a.attrelid = c.oid and a.attnum>0 and a.attname = '{column_name}'""")
49 comment = cur.fetchall()
50
51 # 获得该字段备注和字段类型列表
52 print(f"获得备注类型列表:{comment}")
53
54 # 获得字段的备注和类型
55 column_comment = comment[0][0]
56 column_type = comment[0][1]
57 print(f"获得字段备注:{column_comment}成功,获得字段类型成功:{column_type}")
58
59 # 获得记录数
60 print(f"SELECT count(1),count(DISTINCT cast({column_name} as varchar)) from {self.schema}.{table}")
61 cur.execute(f"""SELECT count(1),count(DISTINCT cast("{column_name}" as varchar)) from {self.schema}.{table}""")
62 field_num = cur.fetchall()
63 all_num = field_num[0][0]
64 only_num = field_num[0][1]
65 print(f"记录数:{all_num},唯一记录数:{only_num}")
66
67 # 唯一率
68 try:
69
70 only_rate = only_num / all_num
71 except Exception as result:
72 print(result)
73 only_rate = 0
74
75
76 # 空值数量
77 try:
78 print(f"SELECT count(1) from self.schema where CAST({column_name} as varchar) != ''")
79 cur.execute(f"""SELECT count(1) from {self.schema}.{table}
80 where cast("{column_name}" as varchar) = ''or "{column_name}" is null""")
81 null_resule = cur.fetchall()
82 null_num = null_resule[0][0]
83
84 null_rate = (all_num - null_num) / all_num
85 print(f"获得null值数量;{null_num}成功")
86 except Exception as result:
87 print(result)
88 null_num = "未获得数据"
89 null_rate = "未获得数据"
90
91 # # 获取最大值/最小值
92
93 if column_type.startswith("i") or column_type.startswith("f") or column_type.startswith("do") or column_type.startswith("n") or column_type.startswith("bi"):
94 try:
95 print(
96 f"select MAX(cast({table}.{column_name} as varchar)),MIN(cast({table}.{column_name} as varchar)) from {self.schema}.{table}")
97 cur.execute(
98 f"""select MAX(cast({table}."{column_name}" as float)),MIN(cast({table}."{column_name}" as float)) from {self.schema}.{table}""")
99 extreme_result = cur.fetchall()
100 max_value = extreme_result[0][0]
101 min_value = extreme_result[0][1]
102 print(extreme_result)
103 print(f"最大值:{max_value},最小值:{min_value}")
104 except:
105
106 print("错误信息")
107
108 else:
109 try:
110 print(
111 f"select MAX(cast({table}.{column_name} as varchar)),MIN(cast({table}.{column_name} as varchar)) from {self.schema}.{table}")
112 cur.execute(
113 f"""select MAX(cast({table}."{column_name}" as varchar)),MIN(cast({table}."{column_name}" as varchar)) from {self.schema}.{table}""")
114 extreme_result = cur.fetchall()
115 max_value = extreme_result[0][0]
116 min_value = extreme_result[0][1]
117 print(extreme_result)
118 print(f"最大值:{max_value},最小值:{min_value}")
119 except:
120
121 print("错误信息")
122
123 # # 获取最大长度和最小长度
124 try:
125 cur.execute(f"""SELECT max(char_length(CAST("{column_name}" as VARCHAR))),\
126 min(char_length(CAST("{column_name}" as VARCHAR))) from {self.schema}.{table}""")
127 column_lengtn = cur.fetchall()
128 max_lenth = column_lengtn[0][0]
129 min_lenth = column_lengtn[0][1]
130 print(f"最大长度:{max_lenth},最小长度:{min_lenth}")
131 except Exception as err:
132 max_lenth = "未获得数据"
133 min_lenth = "未获得数据"
134
135 # 获得样例数据
136 wordlist = []
137 try:
138 print(f"""SELECT distinct cast({column_name} as varchar) from {self.schema}.{table} where CAST('{column_name}' as varchar) != ''
139 LIMIT 4""")
140 cur.execute(f"""SELECT distinct cast("{column_name}" as varchar) from {self.schema}.{table} where CAST("{column_name}" as varchar) != ''
141 LIMIT 4""")
142 words = cur.fetchall()
143 for word in words:
144 print(word)
145 wordlist.append(word[0])
146 print(wordlist)
147 except:
148 wordlist = ['未获得数据']
149
150
151
152
153
154 # print(table, table_comment, column_name, column_comment, column_type, all_num, null_num, only_num, only_rate, max_value, min_value, max_lenth, min_lenth, f"{wordlist}")
155 # 调用写入
156 self.write_into_excel(table, table_comment, column_name, column_comment, column_type, all_num, null_num, null_rate, only_num, \
157 only_rate, max_value, min_value, max_lenth, min_lenth, f"{wordlist}")
158 print("写入成功")
159
160
161
162
163
164 # 从表格获取表明
165 def get_excel_value(self):
166 # 打开
167 data = xlrd.open_workbook(r"C:\Users\Administrator\Desktop\未探查总.xlsx")
168 # 获取sheet对象
169 table = data.sheet_by_name(U"Sheet1")
170
171 # row = table.row_values(0)
172 # 读取第一列数据,返回一个列表
173 col = table.col_values(0)
174 self.tables = col
175 print (f"共从excel中读取{len(self.tables)}个表名")
176
177
178 # 将数据写入表格
179 def write_into_excel(self, table_name, table_comment, column_name, column_comment, column_type, all_num, null_num, null_rate, only_num, only_rate, max_value, min_value,max_lenth,min_lenth,word_list):
180
181 df = pd.DataFrame(pd.read_excel('C:\\Users\Administrator\Desktop\探查模板.xlsx')) # 读取原数据
182 df_rows = df.shape[0] # 获取行数
183 # 增加一条数据
184 df.loc[df_rows] = [table_name, table_comment, column_name, column_comment, column_type, all_num, null_num,null_rate, only_num, only_rate, max_value, min_value, max_lenth, min_lenth, word_list] # 与原数据同格式
185 df.to_excel('C:\\Users\Administrator\Desktop\探查模板.xlsx', sheet_name='sheet1', index=False, header=True)
186
187
188
189
190 if __name__=='__main__':
191 run = Expedition("数据库名", "账户名", "密码", "ip地址", 端口号)
192 run.get_excel_value()
193 run.get_table_info()