小命令

from io import BytesIO
import requests
import pdfplumber
import re
import boto3
from boto3.dynamodb.conditions import Key, Attr
import json

def lambda_handler(event, context):
    """Scrape cninfo announcement PDFs and upload their text to S3.

    Fetches the cninfo index page, collects every ``announcementId`` and
    ``announcementTime`` value found in the HTML, downloads the matching PDF
    from ``static.cninfo.com.cn``, extracts its text with pdfplumber, strips
    newlines and spaces, and stores the records as a JSON array under the
    key ``aa4.json`` in the ``fenci`` bucket.

    A document that cannot be downloaded or parsed is logged and skipped;
    it does not abort the run.

    Args:
        event: Lambda invocation payload (unused).
        context: Lambda runtime context (unused).

    Returns:
        dict: ``{'statusCode': "hello"}``.

    Raises:
        requests.RequestException: If the index page cannot be fetched or
            answers with an HTTP error status.
    """
    import logging

    log = logging.getLogger(__name__)
    # Without a timeout a stalled connection would block the invocation
    # until the Lambda itself is killed.
    http_timeout = 30

    id_pattern = re.compile(r'(?<=announcementId=)\d+')
    date_pattern = re.compile(r'(?<=announcementTime=)\d+-\d+-\d+')

    index_resp = requests.get("http://www.cninfo.com.cn/new/index",
                              timeout=http_timeout)
    # Fail loudly instead of overwriting the S3 object with an empty list
    # when the index page itself is an error response.
    index_resp.raise_for_status()
    announcement_ids = id_pattern.findall(index_resp.text)
    announcement_dates = date_pattern.findall(index_resp.text)

    records = []
    # The first ten matches are skipped, as in the original code.
    # NOTE(review): presumably these belong to a different page section --
    # confirm against the live page.
    pairs = zip(announcement_dates[10:], announcement_ids[10:])
    for seq, (day, ann_id) in enumerate(pairs, start=1):
        pdf_url = f"http://static.cninfo.com.cn/finalpage/{day}/{ann_id}.PDF"
        try:
            pdf_resp = requests.get(pdf_url, timeout=http_timeout)
            pdf_resp.raise_for_status()
            with pdfplumber.open(BytesIO(pdf_resp.content)) as pdf:
                # extract_text() may yield nothing for a page without a text
                # layer; treat that as empty text rather than losing the
                # whole document to a TypeError.
                text = "".join(page.extract_text() or "" for page in pdf.pages)
        except Exception:
            # Best-effort per document: record the failure and move on.
            log.warning("Skipping %s", pdf_url, exc_info=True)
            continue
        text = text.replace("\n", "").replace(" ", "")
        # "id" is the position in the scraped list, so skipped documents
        # leave gaps in the numbering.
        records.append({"id": seq, "name": text})

    s3 = boto3.client('s3')
    s3.put_object(
        Body=json.dumps(records, ensure_ascii=False),
        Bucket='fenci',
        Key='aa4.json',
    )
    # NOTE(review): "hello" is not an HTTP status code; kept unchanged so
    # existing consumers of the return value are unaffected.
    return {
        'statusCode': "hello",
    }

 

解决 CSV 中文乱码(导出时使用带 BOM 的 utf_8_sig 编码)

# Write the CSV using the "utf_8_sig" codec, i.e. UTF-8 prefixed with a BOM.
# NOTE(review): `df` is not defined in this snippet -- presumably a pandas
# DataFrame created elsewhere; the BOM is presumably there so spreadsheet
# tools detect the encoding of non-ASCII text -- confirm.
df.to_csv("/xx.csv", encoding="utf_8_sig")

 

import json
import boto3
# Name of the S3 bucket that get_json_from_s3 reads objects from.
BUCKET = 'fenci'
def lambda_handler(event, context):
    """Return the parsed contents of the ``aa4.json`` object stored in S3.

    Args:
        event: Lambda invocation payload (unused).
        context: Lambda runtime context (unused).

    Returns:
        Whatever ``get_json_from_s3`` yields for the key ``aa4.json``.
    """
    return get_json_from_s3(key="aa4.json")
def get_json_from_s3(key: str):
    """
    Download a JSON object from the ``BUCKET`` S3 bucket and parse it.

    Args:
        key (str): object key of the JSON file inside the bucket

    Returns:
        The value ``json.loads`` produces from the object's UTF-8 text.
    """
    s3_object = boto3.resource('s3').Object(BUCKET, key)
    # The streaming body is read fully and decoded before parsing.
    raw_body = s3_object.get()["Body"].read()
    return json.loads(raw_body.decode("utf-8"))

 

posted @ 2022-07-21 03:07  速搞  阅读(26)  评论(0)    收藏  举报