小命令
from io import BytesIO
import json
import logging
import re

import boto3
from boto3.dynamodb.conditions import Key, Attr
import pdfplumber
import requests

logger = logging.getLogger(__name__)

INDEX_URL = "http://www.cninfo.com.cn/new/index"
PDF_URL_PREFIX = "http://static.cninfo.com.cn/finalpage/"
# Seconds to wait on each HTTP call so a stalled server cannot hold the
# Lambda open until its own timeout.
REQUEST_TIMEOUT = 30
# The first ten matches of each pattern on the index page are skipped.
SKIPPED_MATCHES = 10

ANNOUNCEMENT_ID_RE = re.compile(r'(?<=announcementId=)\d+')
ANNOUNCEMENT_DATE_RE = re.compile(r'(?<=announcementTime=)\d+-\d+-\d+')


def _extract_pdf_text(pdf_bytes):
    """Return the text of every page of a PDF with newlines and spaces removed."""
    with pdfplumber.open(BytesIO(pdf_bytes)) as pdf:
        # Some pdfplumber versions return None for a page without extractable
        # text; substitute "" so one empty page does not discard the document.
        pages = [page.extract_text() or "" for page in pdf.pages]
    return "".join(pages).replace("\n", "").replace(" ", "")


def lambda_handler(event, context):
    """Scrape announcement PDFs linked from the index page and store their text in S3.

    The index page is searched for ``announcementId=`` and
    ``announcementTime=`` values; each (date, id) pair is turned into a PDF
    URL, the PDF is downloaded and its text extracted. The results are written
    as a JSON list of ``{"id": ..., "name": ...}`` objects to the ``fenci``
    bucket under the key ``aa4.json``.

    Args:
        event: Lambda event payload (unused).
        context: Lambda context object (unused).

    Returns:
        dict: ``{'statusCode': "hello"}``.
    """
    documents = []
    index_html = requests.get(INDEX_URL, timeout=REQUEST_TIMEOUT).text
    announcement_ids = ANNOUNCEMENT_ID_RE.findall(index_html)
    announcement_dates = ANNOUNCEMENT_DATE_RE.findall(index_html)

    pairs = zip(
        announcement_dates[SKIPPED_MATCHES:],
        announcement_ids[SKIPPED_MATCHES:],
    )
    # "id" is the position of the pair in the scraped list, so a PDF that
    # fails leaves a gap in the numbering rather than shifting later ids.
    for position, (date, announcement_id) in enumerate(pairs, start=1):
        pdf_url = PDF_URL_PREFIX + date + "/" + announcement_id + ".PDF"
        try:
            response = requests.get(pdf_url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            text = _extract_pdf_text(response.content)
        except Exception:
            # Best effort: one unreadable or missing PDF must not abort the
            # run, but record which one was skipped and why.
            logger.exception("Skipping %s", pdf_url)
            continue
        documents.append({"id": position, "name": text})

    s3 = boto3.client('s3')
    s3.put_object(
        Body=json.dumps(documents, ensure_ascii=False),
        Bucket='fenci',
        Key='aa4.json',
    )
    return {
        'statusCode': "hello",
    }
解决乱码
# Write the DataFrame to CSV using the "utf_8_sig" codec, which prepends a
# UTF-8 byte-order mark to the file.
# NOTE(review): presumably the BOM is what stops non-ASCII text from showing
# up garbled when the file is opened in tools such as Excel — confirm.
df.to_csv("/xx.csv", encoding="utf_8_sig")
import json

import boto3

BUCKET = 'fenci'


def lambda_handler(event, context):
    """Lambda entry point: return the parsed contents of ``aa4.json`` from S3.

    Args:
        event: Lambda event payload (unused).
        context: Lambda context object (unused).

    Returns:
        The value produced by ``get_json_from_s3`` for the key ``aa4.json``.
    """
    return get_json_from_s3(key="aa4.json")


def get_json_from_s3(key: str):
    """Download a JSON object from the ``BUCKET`` bucket and parse it.

    Args:
        key (str): Key of the JSON object inside the bucket.

    Returns:
        The Python value obtained by parsing the object's UTF-8 body as JSON.
    """
    s3_resource = boto3.resource('s3')
    raw_body = s3_resource.Object(BUCKET, key).get()["Body"].read()
    return json.loads(raw_body.decode("utf-8"))

浙公网安备 33010602011771号