python之HTMLParser解析HTML文档(转) python2.7
http://www.cnblogs.com/hester/p/5420605.html
HTMLParser是Python自带的模块,使用简单,能够很容易的实现HTML文件的分析。
本文主要简单讲一下HTMLParser的用法.
使用时需要定义一个从类HTMLParser继承的类,重定义函数:
- handle_starttag(tag, attrs)
- handle_startendtag(tag, attrs)
- handle_endtag(tag)
- handle_data(data)
更多属性及方法请查看源代码:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


import markupbase
import re

# Regular expressions used for parsing

interesting_normal = re.compile('[&<]')
incomplete = re.compile('&[a-zA-Z#]')

entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')

# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# note: if you change tagfind/attrfind remember to update locatestarttagend too
tagfind = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
# this regex is currently unused, but left for backward compatibility
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')

attrfind = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

locatestarttagend = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')


class HTMLParseError(Exception):
    """Exception raised for all parse errors."""

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            result = result + ", column %d" % (self.offset + 1)
        return result


class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    if not end:
                        break
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k - 1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming '&#'
                        self.handle_data(rawdata[i:i + 2])
                        i = self.updatepos(i, i + 2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k - 1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        if rawdata[i:i + 2] != '<!':
            self.error('unexpected call to parse_html_declaration()')
        if rawdata[i:i + 4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i + 3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i + 9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i + 9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i + 2:gtpos])
            return gtpos + 1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        if rawdata[i:i + 2] not in ('<!', '</'):
            self.error('unexpected call to parse_comment()')
        pos = rawdata.find('>', i + 2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i + 2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i + 2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i + 2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i + 2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i + 1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            next = rawdata[j:j + 1]
            if next == ">":
                return j + 1
            if next == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if next == "":
                # end of input
                return -1
            if next in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i + 1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind.match(rawdata, i + 2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i + 3] == '</>':
                    return i + 3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos + 1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        pass

    # Internal -- helper to remove special character quoting
    entitydefs = None

    def unescape(self, s):
        if '&' not in s:
            return s

        def replaceEntities(s):
            s = s.groups()[0]
            try:
                if s[0] == "#":
                    s = s[1:]
                    if s[0] in ['x', 'X']:
                        c = int(s[1:], 16)
                    else:
                        c = int(s)
                    return unichr(c)
            except ValueError:
                return '&#' + s + ';'
            else:
                # Cannot use name2codepoint directly, because HTMLParser supports apos,
                # which is not part of HTML 4
                import htmlentitydefs
                if HTMLParser.entitydefs is None:
                    entitydefs = HTMLParser.entitydefs = {'apos': u"'"}
                    for k, v in htmlentitydefs.name2codepoint.iteritems():
                        entitydefs[k] = unichr(v)
                try:
                    return self.entitydefs[s]
                except KeyError:
                    return '&' + s + ';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
可以看出,源代码中handle_xxxxxx函数体均是空的,需要自己继承并添加处理内容;否则函数不作任何处理。
1. 获取标签属性
tag是html标签名,attrs是由(属性,值)元组(tuple)组成的列表(list).
如一个标签为:<input type="hidden" name="NXX" id="IDXX" value="VXX" />
那么它的attrs列表为[('type', 'hidden'), ('name', 'NXX'), ('id', 'IDXX'), ('value', 'VXX')]
HTMLParser自动将tag和attrs都转为小写。
下面给出的例子抽取了html中的所有链接:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
try:
    from HTMLParser import HTMLParser  # Python 2.7 module name
except ImportError:
    from html.parser import HTMLParser  # same class under its Python 3 name


class MyHTMLParser(HTMLParser):
    """Collect the href value of every <a> start tag fed to the parser."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []  # href values, in document order

    def handle_starttag(self, tag, attrs):
        """Record the href attributes of anchor tags.

        tag is the tag name and attrs a list of (name, value) pairs;
        the parser lower-cases both the tag name and attribute names,
        so <A HREF=...> is matched as well.
        """
        if tag != "a":
            return
        for name, value in attrs:
            if name == "href":
                self.links.append(value)


if __name__ == "__main__":
    html_code = """
    <a href="www.google.com"> google.com</a>
    <A Href="www.pythonclub.org"> PythonClub </a>
    <A HREF = "www.sina.com.cn"> Sina </a>
    """
    hp = MyHTMLParser()
    hp.feed(html_code)
    hp.close()
    print(hp.links)
输出为:
1
|
['www.google.com', 'www.pythonclub.org', 'www.sina.com.cn']
如果想抽取图形链接:
<img src='http://www.google.com/intl/zh-CN_ALL/images/logo.gif' />
就要重定义 handle_startendtag( tag, attrs) 函数
2. 获取标签内容
test1.html文件内容如下:
1
2
3
4
5
6
7
8
|
<html>
<head>
<title>XHTML 与 HTML 4.01 标准没有太多的不同</title>
</head>
<body>
i love you
</body>
</html>
2.1 第一个例子
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
|
import HTMLParser class TitleParser(HTMLParser.HTMLParser): def __init__( self ): HTMLParser.HTMLParser.__init__( self ) # self.taglevels=[] self .handledtags = [ 'title' , 'body' ] self .processing = None def handle_starttag( self ,tag,attrs): print '--------------' print 'handle start func' ,tag def handle_endtag( self ,tag): print '================' print 'handle end func' ,tag if __name__ = = '__main__' : fd = open ( 'test1.html' ) tp = TitleParser() tp.feed(fd.read()) |
运行结果:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
--------------
handle start func html
--------------
handle start func head
--------------
handle start func title
================
handle end func title
================
handle end func head
--------------
handle start func body
================
handle end func body
================
handle end func html
相信大家已经看出来了,解析时碰到<***>,自动调用handle_starttag();碰到</***>,自动调用handle_endtag()
2.2 添加handle_data方法
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
import HTMLParser class TitleParser(HTMLParser.HTMLParser): def __init__( self ): HTMLParser.HTMLParser.__init__( self ) # self.taglevels=[] self .handledtags = [ 'title' , 'body' ] self .processing = None def handle_starttag( self ,tag,attrs): print '--------------' print 'handle start func' ,tag def handle_data( self ,data): print '####' print 'handle data func' if data = = '\n' : print r '\n' else : print data, def handle_endtag( self ,tag): print '=======================' print 'handle end func' ,tag if __name__ = = '__main__' : fd = open ( 'test1.html' ) tp = TitleParser() tp.feed(fd.read()) |
运行结果:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
|
--------------
handle start func html
####
handle data func
\n
--------------
handle start func head
####
handle data func
\n
--------------
handle start func title
####
handle data func
XHTML 与 HTML 4.01 标准没有太多的不同 =======================
handle end func title
####
handle data func
\n
=======================
handle end func head
####
handle data func
\n
--------------
handle start func body
####
handle data func

i love you
=======================
handle end func body
####
handle data func
\n
=======================
handle end func html
说明:
- handle_data()处理的是标签之间的文本:每个标签(无论是<***>还是</***>)之后只要跟有文本,哪怕只是一个换行符,都会调用handle_data()
- html中第一行、第二行分别为<html>和<head>,后面无具体数据,只有回车换行,所以也会调用handle_data(),打印结果为\n;</title>、</head>、</body>之后的换行同理。
2.2 解析需要的内容
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
try:
    from HTMLParser import HTMLParser  # Python 2.7 module name
except ImportError:
    from html.parser import HTMLParser  # same class under its Python 3 name


class TitleParser(HTMLParser):
    """Collect the text found inside <title> and <body> elements."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.handledtags = ['title', 'body']  # tags whose text is kept
        self.processing = None  # handled tag currently open, if any
        self.data = []  # collected text chunks, in document order

    def handle_starttag(self, tag, attrs):
        """Start collecting when one of the handled tags opens."""
        if tag in self.handledtags:
            self.processing = tag

    def handle_data(self, data):
        """Keep the text chunk only while a handled tag is open."""
        if self.processing:
            self.data.append(data)

    def handle_endtag(self, tag):
        """Stop collecting when the tag being processed closes."""
        if tag == self.processing:
            self.processing = None


if __name__ == '__main__':
    # The with-block closes the file even if parsing raises.
    with open('test1.html') as fd:
        tp = TitleParser()
        tp.feed(fd.read())
    for each in tp.data:
        print(each)
运行结果:
1
2
3
|
XHTML 与 HTML 4.01 标准没有太多的不同

i love you
2.3 解析豆瓣热门电影实例
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
|
#encoding=utf8
try:
    import urllib2  # Python 2.7
except ImportError:
    import urllib.request as urllib2  # Python 3: same Request/urlopen names

try:
    from HTMLParser import HTMLParser  # Python 2.7
except ImportError:
    from html.parser import HTMLParser  # Python 3

'''
<li class="ui-slide-item s" data-rater="6802" data-enough="True" data-intro=""
    data-actors="朴灿烈 / 袁姗姗 / 姜潮" data-director="金帝荣" data-region="中国大陆"
    data-duration="99分钟"
    data-ticket="https://movie.douban.com/subject/26564988/cinema/"
    data-trailer="https://movie.douban.com/subject/26564988/trailer"
    data-star="30" data-rate="5.3" data-release="2016"
    data-title="所以……和黑粉结婚了" data-dstat-viewport=".screening-bd"
    data-dstat-watch=".ui-slide-content" data-dstat-mode="click,expose"
    data-dstat-areaid="70_4">
'''


class MYPARSER(HTMLParser):
    """Collect one dict per <li> tag that carries a data-title attribute."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.movies = []  # one dict per movie, in document order

    def handle_starttag(self, tag, attrs):
        """Turn the data-* attributes of a movie <li> into a dict.

        attrs is the list of (name, value) pairs supplied by the parser;
        <li> tags without a non-empty data-title are ignored.
        """
        def _attr(attrlist, attrname):
            # Value of the first attribute called attrname, else None.
            for name, value in attrlist:
                if name == attrname:
                    return value
            return None

        if tag != 'li' or not _attr(attrs, 'data-title'):
            return
        self.movies.append({
            'actors': _attr(attrs, 'data-actors'),
            'director': _attr(attrs, 'data-director'),
            # The attribute is 'data-duration'.  It used to be misspelt
            # 'data-dutation' here, so the lookup always returned None --
            # which is why the sample output shows None in the last column.
            'duration': _attr(attrs, 'data-duration'),
            'title': _attr(attrs, 'data-title'),
            'rate': _attr(attrs, 'data-rate'),
        })


def movieparser(url):
    """Download url and return the list of movie dicts found in the page."""
    headers = {}
    # headers must be passed by keyword: the second positional argument
    # of Request is the request body (data), not the headers.
    req = urllib2.Request(url, headers=headers)
    s = urllib2.urlopen(req)
    try:
        page = s.read()
    finally:
        s.close()  # release the connection even if the read fails
    if not isinstance(page, str):
        # Python 3 returns bytes; the parser needs text there.  On
        # Python 2 the page is already a str and is fed unchanged.
        page = page.decode('utf-8')
    myparser = MYPARSER()
    myparser.feed(page)
    myparser.close()
    return myparser.movies


if __name__ == '__main__':
    url = 'https://movie.douban.com/'
    movies = movieparser(url)
    for each in movies:
        print('%(title)s|%(rate)s|%(actors)s|%(director)s|%(duration)s' % each)
运行结果:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
寒战2|7.2|郭富城 / 梁家辉 / 杨采妮|梁乐民|None
致青春·原来你还在这里|3.9|吴亦凡 / 刘亦菲 / 金世佳|周拓如|None
大鱼海棠|6.6|季冠霖 / 苏尚卿 / 许魏洲|梁旋|None
忍者神龟2:破影而出 Teenage Mutant Ninja Turtles: Out of the Shadows|6.4|梅根·福克斯 / 斯蒂芬·阿美尔 / 威尔·阿奈特|戴夫·格林|None
摇滚藏獒|6.8|郭德纲 / 郭麒麟 / 于谦|艾什·布兰农|None
发条城市|6.4|王宁 / 修睿 / 王自健|江涛|None
赏金猎人|5.5|李敏镐 / 钟汉良 / 唐嫣|申太罗|None
张震讲故事之合租屋|4.8|卢杉 / 傅亨 / 吴谨西|战越|None
惊天魔盗团2 Now You See Me 2|6.6|杰西·艾森伯格 / 伍迪·哈里森 / 戴夫·弗兰科|朱浩伟|None
海底总动员2:多莉去哪儿 Finding Dory|7.4|艾伦·德杰尼勒斯 / 艾伯特·布鲁克斯 / 艾德·奥尼尔|安德鲁·斯坦顿|None
独立日:卷土重来 Independence Day: Resurgence|5.9|利亚姆·海姆斯沃斯 / 杰夫·高布伦 / 比尔·普尔曼|罗兰·艾默里奇|None
丑小鸭历险记|3.3|朱可可 / 阿飞 / 夏倚轩|郑义|None
所以……和黑粉结婚了|5.3|朴灿烈 / 袁姗姗 / 姜潮|金帝荣|None
筷仙|2.7|胡影怡 / 朱璇 / 周骏|姬雨|None
古田会议|2.9|许铂岑 / 王韦智 / 王怡苏|陈健|None
魔轮|4.8|林心如 / 何润东 / 金世佳|王早|None
代码说明:
- 代码开头的三引号字符串是从豆瓣网抓取的一段待解析HTML示例(一个<li>标签),程序要解析的就是这种标签
- 抓取的内容包括:标题、评分、演员、导演、时长