# %% NeurIPS 2020 paper information download
import json
import os
import re

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup


# run relative to this script's directory
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# %%
PAPER_HASH_PATTERN = re.compile(r'poster_(?P<UID>\w+)\.html')
SESSION_PATTERN = re.compile(r'Orals & Spotlights Track \d+:\s*(?P<session>[^;]*)')
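# PAPER_HASH_PATTERN extracts the UID from links of the form 'poster_<UID>.html'
# (the same form used to build the paper URLs below); SESSION_PATTERN keeps only
# the session name from headings such as 'Orals & Spotlights Track 3: Deep Learning'
# (track number and session name here are illustrative).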


def cleanup_string(s):
    """Strip leading/trailing whitespace and collapse runs of spaces."""
    s = s.strip()
    while '  ' in s:
        s = s.replace('  ', ' ')
    return s
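# e.g. cleanup_string('  Deep   Learning ') -> 'Deep Learning' (illustrative input)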


def download_file(download_url, file_name=None):
    """Download a file, streaming to disk with an optional progress bar."""
    if file_name is None:
        file_name = os.path.basename(download_url)
    response = requests.get(download_url, stream=True)
    # Content-Length may be absent; only show a progress bar when the size is known
    total = response.headers.get('Content-Length')
    pbar = None
    if total is not None:
        pbar = tqdm.tqdm(desc=f'Downloading from {download_url} to {file_name}',
                         total=int(total), unit='B', unit_scale=True, unit_divisor=1000)
    with open(file_name, 'wb') as file:
        for chunk in response.iter_content(chunk_size=10240):
            if chunk:
                file.write(chunk)
                if pbar is not None:
                    pbar.update(len(chunk))
    if pbar is not None:
        pbar.close()


# %%
# download the paper list (cached locally, so reruns skip the download)
if not os.path.exists('papers.json'):
    download_file('https://neurips.cc/virtual/2020/public/papers.json', file_name='papers.json')

# %%
# get oral paper list
oral_papers = set()
response = requests.get('https://neurips.cc/virtual/2020/public/f_orals.html')
soup = BeautifulSoup(response.text, 'html.parser')
for tag in soup.find_all('a', href=PAPER_HASH_PATTERN):
    href = tag['href']
    UID = PAPER_HASH_PATTERN.search(href).group('UID')
    oral_papers.add(UID)
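
# optional sanity check on the scrape
print(f'Found {len(oral_papers)} oral paper UIDs')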

# %%
# process paper list
with open('papers.json', mode='r') as file:
    data = json.load(file)

COLUMNS = ['ID', 'Category', 'Title', 'Authors', 'Keywords', 'Sessions', 'URL', 'Proceedings URL', 'PDF URL', 'UID']
# collect rows in a list and build the DataFrame once, which is much faster
# than appending to a DataFrame row by row
rows = []
for paper in tqdm.tqdm(data):
    if paper['eventtype'] != 'Poster':
        continue

    UID = paper['UID']
    # default to Poster; upgrade below if the paper appears in an
    # 'Orals & Spotlights' session or in the scraped oral list
    category = 'Poster'
    sessions = '; '.join(paper['sessions'])
    sessions = '; '.join(match.group('session') for match in SESSION_PATTERN.finditer(sessions))
    sessions = cleanup_string(sessions)
    if sessions != '':
        category = 'Spotlight'
    if UID in oral_papers:
        category = 'Oral'

    keywords = set()
    for keyword in '; '.join(paper['keywords']).split('; '):
        keyword = cleanup_string(keyword)
        if keyword != '':
            keywords.add(keyword)
    keywords = '\n'.join(sorted(keywords))

    rows.append({
        'ID': paper['id'],
        'Category': category,
        'Title': cleanup_string(paper['title']),
        'Authors': cleanup_string(', '.join(paper['authors'])),
        'Keywords': keywords,
        'Sessions': sessions,
        'URL': f'https://neurips.cc/virtual/2020/public/poster_{UID}.html',
        'Proceedings URL': paper['paper_pdf_url'],
        'PDF URL': f'https://proceedings.neurips.cc/paper/2020/file/{UID}-Paper.pdf',
        'UID': UID,
    })

df = pd.DataFrame(rows, columns=COLUMNS)
# sorting a categorical follows the category order: Orals first, then Spotlights, then Posters
df['Category'] = pd.Categorical(df['Category'], categories=['Oral', 'Spotlight', 'Poster'])
df.sort_values(by=['Category', 'Sessions', 'Keywords'], inplace=True, ignore_index=True)
df.to_csv('paper_list.csv', index=False)
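
# quick look at the category split
print(df['Category'].value_counts())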

# %%
# collect the full set of subject areas across all papers
all_subject_areas = set()
for keywords in tqdm.tqdm(df['Keywords'], total=len(df)):
    if keywords == '':
        continue
    all_subject_areas.update(keywords.split('\n'))

# guard against an empty entry sneaking in
all_subject_areas.discard('')

# add one indicator column per subject area and mark each paper's areas with 'Y'
df = df.reindex(columns=df.columns.to_list() + sorted(all_subject_areas))
for index, paper in df.iterrows():
    for area in paper['Keywords'].split('\n'):
        if area != '':
            # .loc avoids chained assignment, which may silently fail to write through
            df.loc[index, area] = 'Y'

df.to_csv('NeurIPS Papers.csv', index=False)
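print(f'Wrote {len(df)} papers with {len(all_subject_areas)} subject-area columns')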