【439】Tweets processing by Python
参数说明:
- coordinates: Represents the geographic location of this Tweet as reported by the user or client application. The inner coordinates array is formatted as GeoJSON (longitude first, then latitude). The value is null when the Tweet carries no exact location, as in the sample output below.
1.文本文件转 json 格式
读取 txt 文件中的 tweets 文本(每行一条 JSON 记录),用 json.loads 将其解析为 Python 对象;解析后既可以用 json.dumps 带缩进地打印输出,也可以按键提取详细信息
代码:
import json
import os
# Folder that holds the raw tweet dumps; each line of a dump is parsed as one
# JSON document.
folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# Get the first txt file. os.listdir() returns entries in arbitrary order and
# may include sub-directories or other files, so filter by extension and sort
# to make the choice deterministic.
txt_files = sorted(name for name in files if name.lower().endswith(".txt"))
if not txt_files:
    raise FileNotFoundError("no .txt tweet file found in " + folderpath)
tweets_data_path = txt_files[0]

# Parsed tweets, one object per successfully decoded line.
tweets_data = []
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding; the `with` block guarantees the file is closed.
with open(tweets_data_path, "r", encoding="utf-8") as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
        except ValueError:
            # Line is not valid JSON (json.JSONDecodeError is a ValueError);
            # skip it and keep going.
            continue
        tweets_data.append(tweet)

# Print the first tweet as JSON with indentation.
print(json.dumps(tweets_data[0], indent=4))
输出:
{
"created_at": "Tue Jun 25 20:44:34 +0000 2019",
"id": 1143621025550049280,
"id_str": "1143621025550049280",
"text": "Australia beat the Poms overnight \ud83d\ude01\ud83c\udfcf\ud83c\udde6\ud83c\uddfa\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f #AUSvENG #CmonAussie #CWC19",
"source": "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>",
"truncated": false,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"in_reply_to_screen_name": null,
"user": {
"id": 252426781,
"id_str": "252426781",
"name": "Willy Aitch",
"screen_name": "WillyAitch",
"location": "Melbourne, Victoria",
"url": null,
"description": "September 2017 to February 2018, was the greatest 5 months ever. Richmond \ud83d\udc2f\ud83d\udc2f\ud83d\udc2fwon the 2017 AFL Premiership! Philadelphia Eagles \ud83e\udd85\ud83e\udd85\ud83e\udd85 won Super Bowl LII",
"translator_type": "none",
"protected": false,
"verified": false,
"followers_count": 417,
"friends_count": 1061,
"listed_count": 15,
"favourites_count": 18852,
"statuses_count": 17796,
"created_at": "Tue Feb 15 04:55:59 +0000 2011",
"utc_offset": null,
"time_zone": null,
"geo_enabled": true,
"lang": null,
"contributors_enabled": false,
"is_translator": false,
"profile_background_color": "C0DEED",
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png",
"profile_background_tile": false,
"profile_link_color": "1DA1F2",
"profile_sidebar_border_color": "C0DEED",
"profile_sidebar_fill_color": "DDEEF6",
"profile_text_color": "333333",
"profile_use_background_image": true,
"profile_image_url": "http://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg",
"profile_banner_url": "https://pbs.twimg.com/profile_banners/252426781/1522377977",
"default_profile": true,
"default_profile_image": false,
"following": null,
"follow_request_sent": null,
"notifications": null
},
"geo": null,
"coordinates": null,
"place": {
"id": "01864a8a64df9dc4",
"url": "https://api.twitter.com/1.1/geo/id/01864a8a64df9dc4.json",
"place_type": "city",
"name": "Melbourne",
"full_name": "Melbourne, Victoria",
"country_code": "AU",
"country": "Australia",
"bounding_box": {
"type": "Polygon",
"coordinates": [
[
[
144.593742,
-38.433859
],
[
144.593742,
-37.511274
],
[
145.512529,
-37.511274
],
[
145.512529,
-38.433859
]
]
]
},
"attributes": {}
},
"contributors": null,
"is_quote_status": false,
"quote_count": 0,
"reply_count": 0,
"retweet_count": 0,
"favorite_count": 0,
"entities": {
"hashtags": [
{
"text": "AUSvENG",
"indices": [
46,
54
]
},
{
"text": "CmonAussie",
"indices": [
55,
66
]
},
{
"text": "CWC19",
"indices": [
67,
73
]
}
],
"urls": [],
"user_mentions": [],
"symbols": []
},
"favorited": false,
"retweeted": false,
"filter_level": "low",
"lang": "en",
"timestamp_ms": "1561495474599"
}
2. 读取关键字内容
通过 .keys() 获取第一条 tweet 的所有顶层键(key)
代码:
import json
import os
# Folder that holds the raw tweet dumps; each line of a dump is parsed as one
# JSON document.
folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# Get the first txt file. os.listdir() returns entries in arbitrary order and
# may include sub-directories or other files, so filter by extension and sort
# to make the choice deterministic.
txt_files = sorted(name for name in files if name.lower().endswith(".txt"))
if not txt_files:
    raise FileNotFoundError("no .txt tweet file found in " + folderpath)
tweets_data_path = txt_files[0]

# Parsed tweets, one object per successfully decoded line.
tweets_data = []
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding; the `with` block guarantees the file is closed.
with open(tweets_data_path, "r", encoding="utf-8") as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
        except ValueError:
            # Line is not valid JSON (json.JSONDecodeError is a ValueError);
            # skip it and keep going.
            continue
        tweets_data.append(tweet)

# List the top-level keys of the first tweet, one per line.
for k in tweets_data[0].keys():
    print(k)
输出:
created_at id id_str text source truncated in_reply_to_status_id in_reply_to_status_id_str in_reply_to_user_id in_reply_to_user_id_str in_reply_to_screen_name user geo coordinates place contributors is_quote_status quote_count reply_count retweet_count favorite_count entities favorited retweeted filter_level lang timestamp_ms
3. 输出键值信息
代码:
import json
import os
# Folder that holds the raw tweet dumps; each line of a dump is parsed as one
# JSON document.
folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# Get the first txt file. os.listdir() returns entries in arbitrary order and
# may include sub-directories or other files, so filter by extension and sort
# to make the choice deterministic.
txt_files = sorted(name for name in files if name.lower().endswith(".txt"))
if not txt_files:
    raise FileNotFoundError("no .txt tweet file found in " + folderpath)
tweets_data_path = txt_files[0]

# Parsed tweets, one object per successfully decoded line.
tweets_data = []
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding; the `with` block guarantees the file is closed.
with open(tweets_data_path, "r", encoding="utf-8") as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
        except ValueError:
            # Line is not valid JSON (json.JSONDecodeError is a ValueError);
            # skip it and keep going.
            continue
        tweets_data.append(tweet)

# Print every top-level key of the first tweet together with its value.
for k, v in tweets_data[0].items():
    print(k, ":", v)
    # Blank line after each entry so long values stay visually separated.
    # NOTE(review): the original indentation was lost; the sample output
    # suggests this print() belongs inside the loop -- confirm.
    print()
输出:
created_at : Tue Jun 25 20:44:34 +0000 2019
id : 1143621025550049280
id_str : 1143621025550049280
text : Australia beat the Poms overnight 😁🏏🇦🇺🏴 #AUSvENG #CmonAussie #CWC19
source : <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>
truncated : False
in_reply_to_status_id : None
in_reply_to_status_id_str : None
in_reply_to_user_id : None
in_reply_to_user_id_str : None
in_reply_to_screen_name : None
user : {'id': 252426781, 'id_str': '252426781', 'name': 'Willy Aitch', 'screen_name': 'WillyAitch', 'location': 'Melbourne, Victoria', 'url': None, 'description': 'September 2017 to February 2018, was the greatest 5 months ever. Richmond 🐯🐯🐯won the 2017 AFL Premiership! Philadelphia Eagles 🦅🦅🦅 won Super Bowl LII', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 417, 'friends_count': 1061, 'listed_count': 15, 'favourites_count': 18852, 'statuses_count': 17796, 'created_at': 'Tue Feb 15 04:55:59 +0000 2011', 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/252426781/1522377977', 'default_profile': True, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None}
geo : None
coordinates : None
place : {'id': '01864a8a64df9dc4', 'url': 'https://api.twitter.com/1.1/geo/id/01864a8a64df9dc4.json', 'place_type': 'city', 'name': 'Melbourne', 'full_name': 'Melbourne, Victoria', 'country_code': 'AU', 'country': 'Australia', 'bounding_box': {'type': 'Polygon', 'coordinates': [[[144.593742, -38.433859], [144.593742, -37.511274], [145.512529, -37.511274], [145.512529, -38.433859]]]}, 'attributes': {}}
contributors : None
is_quote_status : False
quote_count : 0
reply_count : 0
retweet_count : 0
favorite_count : 0
entities : {'hashtags': [{'text': 'AUSvENG', 'indices': [46, 54]}, {'text': 'CmonAussie', 'indices': [55, 66]}, {'text': 'CWC19', 'indices': [67, 73]}], 'urls': [], 'user_mentions': [], 'symbols': []}
favorited : False
retweeted : False
filter_level : low
lang : en
timestamp_ms : 1561495474599
4. 输出二级键值,与2,3类似
代码:
import json
import os
# Folder that holds the raw tweet dumps; each line of a dump is parsed as one
# JSON document.
folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# Get the first txt file. os.listdir() returns entries in arbitrary order and
# may include sub-directories or other files, so filter by extension and sort
# to make the choice deterministic.
txt_files = sorted(name for name in files if name.lower().endswith(".txt"))
if not txt_files:
    raise FileNotFoundError("no .txt tweet file found in " + folderpath)
tweets_data_path = txt_files[0]

# Parsed tweets, one object per successfully decoded line.
tweets_data = []
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding; the `with` block guarantees the file is closed.
with open(tweets_data_path, "r", encoding="utf-8") as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
        except ValueError:
            # Line is not valid JSON (json.JSONDecodeError is a ValueError);
            # skip it and keep going.
            continue
        tweets_data.append(tweet)

# Print the second-level keys and values nested under the "user" object of
# the first tweet.
for k2, v2 in tweets_data[0]["user"].items():
    print(k2, ":", v2)
输出:
id : 252426781 id_str : 252426781 name : Willy Aitch screen_name : WillyAitch location : Melbourne, Victoria url : None description : September 2017 to February 2018, was the greatest 5 months ever. Richmond 🐯🐯🐯won the 2017 AFL Premiership! Philadelphia Eagles 🦅🦅🦅 won Super Bowl LII translator_type : none protected : False verified : False followers_count : 417 friends_count : 1061 listed_count : 15 favourites_count : 18852 statuses_count : 17796 created_at : Tue Feb 15 04:55:59 +0000 2011 utc_offset : None time_zone : None geo_enabled : True lang : None contributors_enabled : False is_translator : False profile_background_color : C0DEED profile_background_image_url : http://abs.twimg.com/images/themes/theme1/bg.png profile_background_image_url_https : https://abs.twimg.com/images/themes/theme1/bg.png profile_background_tile : False profile_link_color : 1DA1F2 profile_sidebar_border_color : C0DEED profile_sidebar_fill_color : DDEEF6 profile_text_color : 333333 profile_use_background_image : True profile_image_url : http://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg profile_image_url_https : https://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg profile_banner_url : https://pbs.twimg.com/profile_banners/252426781/1522377977 default_profile : True default_profile_image : False following : None follow_request_sent : None notifications : None
5. Tweets to csv and reading csv
import codecs
import csv
import json
import os
# Export selected fields of every tweet found in the files of `folderpath`
# to a single CSV file. Each input line is parsed as one JSON document; lines
# that cannot be parsed or lack the expected fields are counted and skipped.
folderpath = r"D:\Twitter Data\Data"
output_path = r"D:\Twitter Data\Data\test\tweets.csv"
files = os.listdir(folderpath)
os.chdir(folderpath)

# Column order of the output CSV; tweet_to_row() must follow it exactly.
header = [
    "id", "created_at", "coordinates", "co_lon", "co_lat",
    "geo", "geo_lat", "geo_lon",
    "user_location", "place_type", "place_name",
    "place_full_name", "place_country", "place_bounding_box",
    "pb_avg_lon", "pb_avg_lat",
    "lang", "source", "text",
]


def one_line(value):
    """Return str(value) with CR/LF replaced by spaces.

    Keeps every CSV record on a single physical line. Quotes and commas are
    left untouched because csv.writer escapes them.
    """
    return str(value).replace("\r", " ").replace("\n", " ")


def point_columns(point):
    """Return the three columns [flag, first, second] for a point object.

    `point` is the tweet's "coordinates" or "geo" value. When it is present
    the flag is "Yes" and the two numbers are copied in the order they are
    stored; otherwise all three columns are the placeholder "None".
    """
    if point:
        return ["Yes", point["coordinates"][0], point["coordinates"][1]]
    return ["None", "None", "None"]


def tweet_to_row(tweet):
    """Build one CSV row (a list in `header` order) from a parsed tweet.

    Raises KeyError, TypeError or IndexError when the object does not have
    the fields read here, so the caller can skip it.
    """
    row = [tweet["id_str"], one_line(tweet["created_at"])]
    row += point_columns(tweet["coordinates"])
    row += point_columns(tweet["geo"])
    row.append(one_line(tweet["user"]["location"]))

    # "place" may be null; write "None" placeholders instead of dropping
    # the whole tweet.
    place = tweet["place"] or {}
    for key in ("place_type", "name", "full_name", "country"):
        row.append(one_line(place.get(key)))

    corners = (place.get("bounding_box") or {}).get("coordinates")
    if corners:
        ring = corners[0]
        # Corner 0 is read as (min_lon, min_lat) and corner 2 as
        # (max_lon, max_lat); the centre of the box is their average.
        min_lon, min_lat = ring[0][0], ring[0][1]
        max_lon, max_lat = ring[2][0], ring[2][1]
        row += ["Yes", (min_lon + max_lon) / 2, (min_lat + max_lat) / 2]
    else:
        row += ["None", "None", "None"]

    row += [one_line(tweet["lang"]), one_line(tweet["source"]),
            one_line(tweet["text"])]
    return row


count = 0    # number of input files processed
skipped = 0  # number of lines that could not be turned into a row

# "utf-8-sig" writes the BOM that used to be written by hand, independent of
# the platform default encoding; newline="" is what the csv module requires.
# errors="replace" keeps the export best-effort if a string cannot be encoded.
with open(output_path, "w", encoding="utf-8-sig", errors="replace",
          newline="") as fo:
    writer = csv.writer(fo)
    writer.writerow(header)
    for file in files:
        file_path = os.path.join(folderpath, file)
        # Skip sub-directories; only regular entries are read.
        if os.path.isdir(file_path):
            continue
        count += 1
        print(count, ":", file)
        with open(file_path, "r", encoding="utf-8",
                  errors="replace") as tweets_file:
            for line in tweets_file:
                if not line.strip():
                    continue
                try:
                    row = tweet_to_row(json.loads(line))
                except (ValueError, KeyError, TypeError, IndexError):
                    # Invalid JSON or an object without the tweet fields.
                    skipped += 1
                    continue
                writer.writerow(row)

print("skipped", skipped, "unparseable or incomplete lines")
import pandas as pd

# Read the exported CSV back into a DataFrame. "utf-8-sig" strips the BOM at
# the start of the file so the first column is named "id"; undecodable bytes
# are ignored as before. The `with` block closes the handle once pandas has
# finished reading.
with open(r"D:\Twitter Data\Data\test\tweets.csv", encoding="utf-8-sig",
          errors="ignore") as csv_file:
    df = pd.read_csv(csv_file)
df.head()
浙公网安备 33010602011771号