Python 规范化LinkedIn用户联系人的职位名
CODE:
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on 2014-8-19
@author: guaguastd
@name: job_title_standard.py
'''
import csv
import os
import re
from collections import Counter
from operator import itemgetter

from prettytable import PrettyTable
# Location of the CSV file holding the LinkedIn connections.
CSV_FILE = os.path.join(r"E:", "\\", "eclipse", "LinkedIn", "dfile", "my_connections.csv")

# Ordered (abbreviation, expansion) pairs applied to every job title.
# Order matters: the dotted forms ('Sr.', 'Jr.') come before the bare forms
# so that "Sr." becomes "Senior" rather than "Senior.".
transforms = [
    ('Sr.', 'Senior'),
    ('Sr', 'Senior'),
    ('Jr.', 'Junior'),
    ('Jr', 'Junior'),
    ('CEO', 'Chief Executive Officer'),
    ('COO', 'Chief Operating Officer'),
    ('CTO', 'Chief Technology Officer'),
    # The conventional expansion of CFO is "Chief Financial Officer"; using it
    # lets abbreviated and spelled-out titles fall into the same bucket.
    ('CFO', 'Chief Financial Officer'),
    ('VP', 'Vice President'),
]
# Load every row of the CSV file as a dict keyed by column header.
# The `with` block closes the file handle once all rows have been read.
with open(CSV_FILE) as csv_file:
    csvReader = csv.DictReader(csv_file, delimiter=',', quotechar='"')
    contacts = list(csvReader)

# Collect the job titles, splitting combined titles such as "President/CEO"
# on '/'. Other separators ("President & CEO", "President and CEO") are NOT
# split here and stay as a single title.
titles = []
for contact in contacts:
    # DictReader fills columns missing from a short row with None.
    job_title = contact['Job Title'] or ''
    # Drop blank pieces so that an empty field or a stray '/' ("President/")
    # does not add an empty title to the counts.
    titles.extend(
        piece.strip() for piece in job_title.split('/') if piece.strip()
    )
# Expand common/known abbreviations in every title.
#
# Each abbreviation is matched only as a standalone word: plain substring
# replacement would also rewrite the inside of longer words, e.g. turning
# "SVP" into "SVice President".
for old, new in transforms:
    # Must not be preceded by a word character and, when the abbreviation
    # itself ends in a word character, must not be followed by one either.
    pattern = re.compile(
        r'(?<!\w)' + re.escape(old) + (r'(?!\w)' if old[-1:].isalnum() else '')
    )
    # A callable replacement inserts `new` verbatim (no backslash escapes);
    # slice assignment keeps updating the same list object in place.
    titles[:] = [
        pattern.sub(lambda _match, new=new: new, title) for title in titles
    ]
# Show each distinct title with its number of occurrences, most frequent
# first, as a left-aligned two-column table.
c = Counter(titles)
pt = PrettyTable(field_names=['Title', 'Freq'])
pt.align = 'l'
# most_common() yields the (title, count) pairs ordered by descending count.
for title, freq in c.most_common():
    if freq <= 0:
        continue
    pt.add_row([title, freq])
# With a single argument the parenthesised form prints the same text.
print(pt)
# Break every title into whitespace-separated tokens (leading/trailing commas
# removed) and tabulate how often each token occurs, most frequent first.
tokens = [word.strip(',') for title in titles for word in title.split()]
c = Counter(tokens)
pt = PrettyTable(field_names=['Token', 'Freq'])
pt.align = 'l'
for token, freq in c.most_common():
    # Tokens of one or two characters are left out of the table.
    if freq > 0 and len(token) > 2:
        pt.add_row([token, freq])
print pt
RESULT:
+-----------------------------------+------+
| Title                             | Freq |
+-----------------------------------+------+
| Senior Software Developer         | 1    |
| Sales Manager                     | 1    |
| Software Manager                  | 1    |
| Online Marketing Manager          | 1    |
| Senior Consultant                 | 1    |
| Chief Executive Officer & Founder | 1    |
| Director                          | 1    |
| S                                 | 1    |
| Student                           | 1    |
| Senior Software Engineer          | 1    |
| ???                               | 1    |
+-----------------------------------+------+
+------------+------+
| Token      | Freq |
+------------+------+
| Manager    | 3    |
| Senior     | 3    |
| Software   | 3    |
| Marketing  | 1    |
| Founder    | 1    |
| Consultant | 1    |
| Executive  | 1    |
| Sales      | 1    |
| Developer  | 1    |
| Director   | 1    |
| Chief      | 1    |
| Officer    | 1    |
| Student    | 1    |
| Online     | 1    |
| ???        | 1    |
| Engineer   | 1    |
+------------+------+

浙公网安备 33010602011771号