import os
import random
import string
import subprocess

import numpy as np
import pandas as pd
def random_string(n: int) -> str:
    """Return a random string made of ``n`` ASCII letters.

    Each character is drawn independently, with replacement, from
    ``string.ascii_letters`` (upper- and lower-case ``a``-``z``).

    Args:
        n: Number of characters to generate.

    Returns:
        A string of length ``n``; the empty string when ``n`` is 0.
    """
    # random.choices samples uniformly by default, so no explicit weights
    # are needed.
    return ''.join(random.choices(string.ascii_letters, k=n))
# Number of rows in each generated mock table.
rows = 3

# HDFS directory that receives the generated parquet files.
HDFS_PARQUET_DIR = '/user/b_aip/zliu3/parquets'

# Generate four small mock tables (one per ``group`` value), write each one to
# a local parquet file and move that file to HDFS.
for i in range(4):
    parquet_name = f'mock-id-name-age-score-{i}.parquet'

    # Build the random columns as concrete lists rather than generator
    # expressions, so the frame does not rely on pandas draining generators.
    # The columns are filled in the order name, age, score.
    data = {
        'id': range(rows),
        'name': [random_string(5) for _ in range(rows)],
        'age': [random.randrange(10, 99) for _ in range(rows)],
        'score': [round(random.uniform(0, 100), 2) for _ in range(rows)],
        'group': i,  # scalar: repeated for every row of the index
    }

    # Row labels are consecutive characters starting at 'a' ('a', 'b', 'c', ...).
    row_labels = [chr(ord('a') + offset) for offset in range(rows)]
    df = pd.DataFrame(data=data, index=row_labels)
    print(df)

    df.to_parquet(path=parquet_name, engine='pyarrow', compression='snappy', index=True)

    # Remove any previous copy on HDFS, then move the fresh local file there.
    # check=True raises CalledProcessError when the hadoop CLI exits non-zero,
    # so a failed upload is no longer silently ignored.
    subprocess.run(
        ['hadoop', 'fs', '-rm', '-f', f'{HDFS_PARQUET_DIR}/{parquet_name}'],
        check=True,
    )
    subprocess.run(
        ['hadoop', 'fs', '-moveFromLocal', parquet_name, HDFS_PARQUET_DIR],
        check=True,
    )