import hashlib
import io
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pygtrans import Translate

def multi_requests(url, headers=None, verify=False, proxies=None, timeout=120):
    """GET `url`, retrying up to 5 times with a 5-second pause after each failure."""
    res = None
    for _ in range(5):
        try:
            res = requests.get(url, headers=headers, verify=verify,
                               proxies=proxies, timeout=timeout)
            if res.status_code == 200:
                break
        except requests.RequestException:
            time.sleep(5)
    return res

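# Note: multi_requests returns None when all five attempts fail, so callers
# that dereference .content should guard for that, e.g.:
#     res = multi_requests("https://example.com")
#     body = res.content if res is not None else b''
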
def md5_hash(content):
    md5 = hashlib.md5()
    if isinstance(content, str):
        content = content.encode('utf-8')
    md5.update(content)
    return md5.hexdigest()

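# Sanity check against the RFC 1321 MD5 test vector:
#     md5_hash('abc')  # -> '900150983cd24fb0d6963f7d28e17f72'
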
def new_request(url: list):
    MD_url, IVD_url, IVD2_url = url
    bytes_datas = get_bytesdata((MD_url, IVD_url, IVD2_url))
    df_list = parse_excel_bytes_data(bytes_datas)
    for df in df_list:
        # Excel headers use '_', e.g. 'ivd_md_type' -> 'ivd md type' to match `group` below.
        df.columns = df.columns.str.replace('_', ' ')
    df_all = pd.concat(df_list, axis=0, ignore_index=True).dropna(axis=1, how='all').fillna('')
    df_all['create_date'] = '2023-03-02'
    group = [
        'EXPEDIENTE',
        'PRODUCTO',
        'REGISTRO SANITARIO',
        'ESTADO REGISTRO',
        'FECHA EXPEDICION',
        'FECHA VENCIMIENTO',
        'MODALIDAD',
        'GRUPO',
        'NIVEL RIESGO',
        'USOS',
        'VIDA UTIL',
        'TITULAR',
        'PAIS TITULAR',
        'DEPARTAMENTO TITULAR',
        'CIUDAD TITULAR',
        'ivd md type',
        'create_date'
    ]
    # Pivot repeated rows into columns: one (NOMBRE ROL, PAIS ROL) pair per ROL value.
    df_all.drop_duplicates(subset=group + ['ROL'], inplace=True)
    reset_df = df_all.pivot(index=group, columns=['ROL'], values=['NOMBRE ROL', 'PAIS ROL'])
    df_reset = reset_df.reset_index().fillna('')
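    # For example (illustrative values), a row with ROL='FABRICANTE', NOMBRE ROL='ACME',
    # PAIS ROL='CHINA' lands in columns ('NOMBRE ROL', 'FABRICANTE') and ('PAIS ROL', 'FABRICANTE'),
    # which the join below flattens to 'NOMBRE ROLFABRICANTE' / 'PAIS ROLFABRICANTE'.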
    # Translate country names: first flatten the pivot's MultiIndex columns.
    df_reset.columns = [''.join(col) for col in df_reset.columns.values]
    # Country columns ('PAIS ...') produced by the pivot that need translating.
    need_tran = ['PAIS TITULAR', 'PAIS ROLACONDICIONADOR', 'PAIS ROLALMACENADOR', 'PAIS ROLAPODERADO',
                 'PAIS ROLEMPACADOR', 'PAIS ROLENTIDAD RESPONSABLE SISMED', 'PAIS ROLENVASADOR',
                 'PAIS ROLEXPORTADOR', 'PAIS ROLFABRICANTE', 'PAIS ROLIMPORTADOR', 'PAIS ROLMAQUILADOR',
                 'PAIS ROLREPRESENTANTE LEGAL', 'PAIS ROLRESPONSABLE']
    for col in need_tran:
        df_reset[f'{col}_t'] = country_translate(df_reset[col])
    date_column = ['FECHA EXPEDICION', 'FECHA VENCIMIENTO']
    for column in date_column:
        df_reset[column] = parser_date(df_reset[column])
    # md5_hash encodes str input itself, so pass the concatenated string directly.
    df_reset['ID'] = df_reset.apply(lambda x: md5_hash(x['PRODUCTO'] + x['TITULAR']), axis=1)
    return df_reset

def parser_date(data: pd.Series):
    """Map each unique raw date value to 'YYYY-MM-DD'; unparseable values become ''."""
    dict_ = {}
    for date_value in data.unique():
        try:
            # errors='coerce' turns unparseable values into NaT instead of raising.
            parsed = pd.to_datetime(date_value, errors='coerce')
            if pd.notna(parsed):
                dict_[date_value] = parsed.strftime('%Y-%m-%d')
        except (TypeError, ValueError):
            continue
    return data.apply(lambda x: dict_.get(x, ''))

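# Quick check (pandas' default month-first parsing):
#     parser_date(pd.Series(['02/03/2023', 'n/a']))  # -> ['2023-02-03', '']
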
def parse_excel_bytes_data(bytes_data):
    """
    bytes_data : the Excel files' raw bytes, in order (MD, IVD, IVD2).
    """
    # `data` rather than `bytes`, to avoid shadowing the built-in.
    df_excel = [pd.read_excel(io.BytesIO(data), engine='openpyxl') for data in bytes_data]
    df_excel[0]['ivd_md_type'] = 'MD'
    df_excel[1]['ivd_md_type'] = 'IVD'
    df_excel[2]['ivd_md_type'] = 'IVD'
    return df_excel

def get_bytesdata(urls):
    """
    urls : the URLs of the Excel files to download.
    """
    bytes_data = tuple(multi_requests(url, verify=False).content for url in urls)
    return bytes_data

def country_translate(data: pd.Series):
    """
    Build a translation map for the unique country names in `data` and apply it;
    values that fail to translate map to ''.
    """
    client = Translate()
    dict_ = {}
    for i in data.unique():
        try:
            text = client.translate(i, target='en').translatedText.strip()
            dict_[i] = text
        except Exception:
            continue
    return data.apply(lambda x: dict_.get(x, ''))

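# Rough sketch (requires network access to the translation service; the exact
# casing/wording of the output depends on it):
#     country_translate(pd.Series(['ESTADOS UNIDOS']))  # -> e.g. ['United States']
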
first_res = multi_requests(url="https://www.invima.gov.co/base-de-datos-ddmyot")
soup = BeautifulSoup(first_res.content, 'html.parser')
# The matching accordion panel holds the <a> links to the three Excel files.
url_list = [a['href'] for a in soup.find('div', id=re.compile(r'\d?-etabs-ecollapse-2-1')).find_all('a')]
# The first link is a Google Drive "edit" URL; rewrite it into a direct-download URL.
url_list[0] = "https://drive.google.com/uc?export=download&id=" + re.findall(r"/d/(.*?)/edit", url_list[0])[0]
content = new_request(url_list)
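# A minimal sketch of persisting the result; the output filename is an assumption:
#     content.to_csv('invima_md_ivd.csv', index=False)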