alex_bn_lee

导航

【439】Tweets processing by Python

参数说明:

  • coordinates:Represents the geographic location of this Tweet as reported by the user or client application. The inner coordinates array is formatted as geoJSON (longitude first, then latitude).

1.文本文件转 json 格式

  逐行读取 txt 文件中的 tweets 文本(每行一条 JSON),用 json.loads 将其解析为 Python 字典,之后既可以格式化打印输出,也可以提取详细信息

代码:

import json
import os

folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# Take the first entry returned by os.listdir; it is assumed to be a tweets
# txt file with one JSON document per line (listdir order is not guaranteed).
tweets_data_path = files[0]

# Parsed tweets: one dict per line that decoded successfully.
tweets_data = []
# Read as UTF-8 explicitly (the platform default encoding on Windows is not
# UTF-8); the with-block guarantees the file is closed.
with open(tweets_data_path, "r", encoding="utf-8") as tweets_file:
    for line in tweets_file:
        try:
            tweets_data.append(json.loads(line))
        except ValueError:
            # json.JSONDecodeError is a ValueError: skip blank or non-JSON
            # lines, but let KeyboardInterrupt and real bugs propagate.
            continue

# Pretty-print the first tweet with 4-space indentation.
print(json.dumps(tweets_data[0], indent=4))

输出:

{
    "created_at": "Tue Jun 25 20:44:34 +0000 2019",
    "id": 1143621025550049280,
    "id_str": "1143621025550049280",
    "text": "Australia beat the Poms overnight \ud83d\ude01\ud83c\udfcf\ud83c\udde6\ud83c\uddfa\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f #AUSvENG #CmonAussie #CWC19",
    "source": "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>",
    "truncated": false,
    "in_reply_to_status_id": null,
    "in_reply_to_status_id_str": null,
    "in_reply_to_user_id": null,
    "in_reply_to_user_id_str": null,
    "in_reply_to_screen_name": null,
    "user": {
        "id": 252426781,
        "id_str": "252426781",
        "name": "Willy Aitch",
        "screen_name": "WillyAitch",
        "location": "Melbourne, Victoria",
        "url": null,
        "description": "September 2017 to February 2018, was the greatest 5 months ever. Richmond \ud83d\udc2f\ud83d\udc2f\ud83d\udc2fwon the 2017 AFL Premiership! Philadelphia Eagles \ud83e\udd85\ud83e\udd85\ud83e\udd85 won Super Bowl LII",
        "translator_type": "none",
        "protected": false,
        "verified": false,
        "followers_count": 417,
        "friends_count": 1061,
        "listed_count": 15,
        "favourites_count": 18852,
        "statuses_count": 17796,
        "created_at": "Tue Feb 15 04:55:59 +0000 2011",
        "utc_offset": null,
        "time_zone": null,
        "geo_enabled": true,
        "lang": null,
        "contributors_enabled": false,
        "is_translator": false,
        "profile_background_color": "C0DEED",
        "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
        "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png",
        "profile_background_tile": false,
        "profile_link_color": "1DA1F2",
        "profile_sidebar_border_color": "C0DEED",
        "profile_sidebar_fill_color": "DDEEF6",
        "profile_text_color": "333333",
        "profile_use_background_image": true,
        "profile_image_url": "http://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg",
        "profile_image_url_https": "https://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg",
        "profile_banner_url": "https://pbs.twimg.com/profile_banners/252426781/1522377977",
        "default_profile": true,
        "default_profile_image": false,
        "following": null,
        "follow_request_sent": null,
        "notifications": null
    },
    "geo": null,
    "coordinates": null,
    "place": {
        "id": "01864a8a64df9dc4",
        "url": "https://api.twitter.com/1.1/geo/id/01864a8a64df9dc4.json",
        "place_type": "city",
        "name": "Melbourne",
        "full_name": "Melbourne, Victoria",
        "country_code": "AU",
        "country": "Australia",
        "bounding_box": {
            "type": "Polygon",
            "coordinates": [
                [
                    [
                        144.593742,
                        -38.433859
                    ],
                    [
                        144.593742,
                        -37.511274
                    ],
                    [
                        145.512529,
                        -37.511274
                    ],
                    [
                        145.512529,
                        -38.433859
                    ]
                ]
            ]
        },
        "attributes": {}
    },
    "contributors": null,
    "is_quote_status": false,
    "quote_count": 0,
    "reply_count": 0,
    "retweet_count": 0,
    "favorite_count": 0,
    "entities": {
        "hashtags": [
            {
                "text": "AUSvENG",
                "indices": [
                    46,
                    54
                ]
            },
            {
                "text": "CmonAussie",
                "indices": [
                    55,
                    66
                ]
            },
            {
                "text": "CWC19",
                "indices": [
                    67,
                    73
                ]
            }
        ],
        "urls": [],
        "user_mentions": [],
        "symbols": []
    },
    "favorited": false,
    "retweeted": false,
    "filter_level": "low",
    "lang": "en",
    "timestamp_ms": "1561495474599"
}

 

2. 读取关键字内容

  通过 .keys() 获取所有的键(key)

代码:

import json
import os

folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# Take the first entry returned by os.listdir; it is assumed to be a tweets
# txt file with one JSON document per line (listdir order is not guaranteed).
tweets_data_path = files[0]

# Parsed tweets: one dict per line that decoded successfully.
tweets_data = []
# Read as UTF-8 explicitly; the with-block guarantees the file is closed.
with open(tweets_data_path, "r", encoding="utf-8") as tweets_file:
    for line in tweets_file:
        try:
            tweets_data.append(json.loads(line))
        except ValueError:
            # json.JSONDecodeError is a ValueError: skip blank or non-JSON
            # lines, but let KeyboardInterrupt and real bugs propagate.
            continue

# List every top-level key of the first tweet.
for k in tweets_data[0].keys():
    print(k)

 输出:

created_at
id
id_str
text
source
truncated
in_reply_to_status_id
in_reply_to_status_id_str
in_reply_to_user_id
in_reply_to_user_id_str
in_reply_to_screen_name
user
geo
coordinates
place
contributors
is_quote_status
quote_count
reply_count
retweet_count
favorite_count
entities
favorited
retweeted
filter_level
lang
timestamp_ms

 

3. 输出键值信息

 代码:

import json
import os

folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# Take the first entry returned by os.listdir; it is assumed to be a tweets
# txt file with one JSON document per line (listdir order is not guaranteed).
tweets_data_path = files[0]

# Parsed tweets: one dict per line that decoded successfully.
tweets_data = []
# Read as UTF-8 explicitly; the with-block guarantees the file is closed.
with open(tweets_data_path, "r", encoding="utf-8") as tweets_file:
    for line in tweets_file:
        try:
            tweets_data.append(json.loads(line))
        except ValueError:
            # json.JSONDecodeError is a ValueError: skip blank or non-JSON
            # lines, but let KeyboardInterrupt and real bugs propagate.
            continue

# Print each top-level key of the first tweet with its value, followed by
# an empty line to separate the entries.
for k, v in tweets_data[0].items():
    print(k, ":", v)
    print()

 输出:

created_at : Tue Jun 25 20:44:34 +0000 2019

id : 1143621025550049280

id_str : 1143621025550049280

text : Australia beat the Poms overnight 😁🏏🇦🇺🏴󠁧󠁢󠁥󠁮󠁧󠁿 #AUSvENG #CmonAussie #CWC19

source : <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>

truncated : False

in_reply_to_status_id : None

in_reply_to_status_id_str : None

in_reply_to_user_id : None

in_reply_to_user_id_str : None

in_reply_to_screen_name : None

user : {'id': 252426781, 'id_str': '252426781', 'name': 'Willy Aitch', 'screen_name': 'WillyAitch', 'location': 'Melbourne, Victoria', 'url': None, 'description': 'September 2017 to February 2018, was the greatest 5 months ever. Richmond 🐯🐯🐯won the 2017 AFL Premiership! Philadelphia Eagles 🦅🦅🦅 won Super Bowl LII', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 417, 'friends_count': 1061, 'listed_count': 15, 'favourites_count': 18852, 'statuses_count': 17796, 'created_at': 'Tue Feb 15 04:55:59 +0000 2011', 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/252426781/1522377977', 'default_profile': True, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None}

geo : None

coordinates : None

place : {'id': '01864a8a64df9dc4', 'url': 'https://api.twitter.com/1.1/geo/id/01864a8a64df9dc4.json', 'place_type': 'city', 'name': 'Melbourne', 'full_name': 'Melbourne, Victoria', 'country_code': 'AU', 'country': 'Australia', 'bounding_box': {'type': 'Polygon', 'coordinates': [[[144.593742, -38.433859], [144.593742, -37.511274], [145.512529, -37.511274], [145.512529, -38.433859]]]}, 'attributes': {}}

contributors : None

is_quote_status : False

quote_count : 0

reply_count : 0

retweet_count : 0

favorite_count : 0

entities : {'hashtags': [{'text': 'AUSvENG', 'indices': [46, 54]}, {'text': 'CmonAussie', 'indices': [55, 66]}, {'text': 'CWC19', 'indices': [67, 73]}], 'urls': [], 'user_mentions': [], 'symbols': []}

favorited : False

retweeted : False

filter_level : low

lang : en

timestamp_ms : 1561495474599

 

4. 输出二级键值,与2,3类似

代码:

import json
import os

folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# Take the first entry returned by os.listdir; it is assumed to be a tweets
# txt file with one JSON document per line (listdir order is not guaranteed).
tweets_data_path = files[0]

# Parsed tweets: one dict per line that decoded successfully.
tweets_data = []
# Read as UTF-8 explicitly; the with-block guarantees the file is closed.
with open(tweets_data_path, "r", encoding="utf-8") as tweets_file:
    for line in tweets_file:
        try:
            tweets_data.append(json.loads(line))
        except ValueError:
            # json.JSONDecodeError is a ValueError: skip blank or non-JSON
            # lines, but let KeyboardInterrupt and real bugs propagate.
            continue

# Print the second-level keys and values nested under "user" in the first tweet.
for k2, v2 in tweets_data[0]["user"].items():
    print(k2, ":", v2)

输出:

id : 252426781
id_str : 252426781
name : Willy Aitch
screen_name : WillyAitch
location : Melbourne, Victoria
url : None
description : September 2017 to February 2018, was the greatest 5 months ever. Richmond 🐯🐯🐯won the 2017 AFL Premiership! Philadelphia Eagles 🦅🦅🦅 won Super Bowl LII
translator_type : none
protected : False
verified : False
followers_count : 417
friends_count : 1061
listed_count : 15
favourites_count : 18852
statuses_count : 17796
created_at : Tue Feb 15 04:55:59 +0000 2011
utc_offset : None
time_zone : None
geo_enabled : True
lang : None
contributors_enabled : False
is_translator : False
profile_background_color : C0DEED
profile_background_image_url : http://abs.twimg.com/images/themes/theme1/bg.png
profile_background_image_url_https : https://abs.twimg.com/images/themes/theme1/bg.png
profile_background_tile : False
profile_link_color : 1DA1F2
profile_sidebar_border_color : C0DEED
profile_sidebar_fill_color : DDEEF6
profile_text_color : 333333
profile_use_background_image : True
profile_image_url : http://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg
profile_image_url_https : https://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg
profile_banner_url : https://pbs.twimg.com/profile_banners/252426781/1522377977
default_profile : True
default_profile_image : False
following : None
follow_request_sent : None
notifications : None

 

5. Tweets to csv and reading csv

import codecs
import csv
import json
import os

import pandas as pd

folderpath = r"D:\Twitter Data\Data"
csv_path = r"D:\Twitter Data\Data\test\tweets.csv"

# Column order of the generated CSV.
CSV_HEADER = [
    "id", "created_at", "coordinates", "co_lon", "co_lat",
    "geo", "geo_lat", "geo_lon",
    "user_location", "place_type", "place_name",
    "place_full_name", "place_country", "place_bounding_box",
    "pb_avg_lon", "pb_avg_lat",
    "lang", "source", "text",
]

# Placeholder written for a missing (flag, value, value) triple.
MISSING_TRIPLE = ["None", "None", "None"]

# Line breaks become spaces, single and double quotes are removed.
_CLEANUP = str.maketrans({"\r": " ", "\n": " ", "\"": None, "'": None})


def _clean(value):
    """Return str(value) on a single line with all quote characters removed."""
    return str(value).translate(_CLEANUP)


def _tweet_to_row(tweet):
    """Flatten one decoded tweet dict into a list matching CSV_HEADER.

    Raises KeyError, TypeError or IndexError when an expected field is
    absent or null (for example a tweet whose "place" is null); the caller
    skips such tweets.
    """
    row = [tweet["id_str"], str(tweet["created_at"])]

    # "coordinates" holds [first, second]; written under co_lon, co_lat.
    point = tweet["coordinates"]
    if point:
        row += ["Yes", point["coordinates"][0], point["coordinates"][1]]
    else:
        row += MISSING_TRIPLE

    # "geo" holds [first, second]; written under geo_lat, geo_lon.
    geo = tweet["geo"]
    if geo:
        row += ["Yes", geo["coordinates"][0], geo["coordinates"][1]]
    else:
        row += MISSING_TRIPLE

    row.append(_clean(tweet["user"]["location"]))

    place = tweet["place"]
    row += [
        str(place["place_type"]),
        str(place["name"]),
        str(place["full_name"]),
        str(place["country"]),
    ]

    # Average the first and third corner of the bounding box ring.
    box = place["bounding_box"]["coordinates"]
    if box:
        first_corner, third_corner = box[0][0], box[0][2]
        row += [
            "Yes",
            (first_corner[0] + third_corner[0]) / 2,
            (first_corner[1] + third_corner[1]) / 2,
        ]
    else:
        row += MISSING_TRIPLE

    row += [str(tweet["lang"]), str(tweet["source"]), _clean(tweet["text"])]
    return row


files = os.listdir(folderpath)
os.chdir(folderpath)

count = 0
# "utf-8-sig" writes the BOM itself, so the file no longer depends on the
# platform default encoding; newline="" is what the csv module requires.
with open(csv_path, "w", encoding="utf-8-sig", newline="") as fo:
    writer = csv.writer(fo)
    writer.writerow(CSV_HEADER)

    for filename in files:
        # Sub-directories (including the output folder) are not tweet files.
        if os.path.isdir(filename):
            continue

        count += 1
        print(count, ":", filename)

        with open(filename, "r", encoding="utf-8") as tweets_file:
            for line in tweets_file:
                try:
                    # csv.writer quotes and escapes fields as needed, so
                    # embedded quotes (e.g. in "source") stay well-formed.
                    writer.writerow(_tweet_to_row(json.loads(line)))
                except (ValueError, KeyError, TypeError, IndexError):
                    # Best effort: skip non-JSON lines, tweets lacking the
                    # expected fields, and text that cannot be encoded
                    # (UnicodeEncodeError is a ValueError).
                    continue

# Passing the path lets pandas open and close the file; "utf-8-sig" drops the BOM.
df = pd.read_csv(csv_path, encoding="utf-8-sig")
df.head()

  

 

 

 

posted on 2019-09-24 08:56  McDelfino  阅读(265)  评论(0)    收藏  举报