PctGL SERIES  
http://pctgl.cnblogs.com

 

vb6 基于字节码分析的网页源码分析类模块

类模块唯一入口:
webpage_Analyzer(网页源码字符串)

 

函数将分析所有标签:< xxx >   < / xxx > 的内容, 但不分析标签之间的内容, 比如  <p> xxxx </p> 之间的 xxxx , xxxx 属于单独提取的方法

提取的标签全部保存在 PG.htmlElementCollection.htmlElement 数组之中, 
元素之间的顺序关系按元素顺序分析,层级关系为 IndexParent (值为父元素在数组中的数组索引值)

 

这个代码分析新浪首页187K内容,提取全部标签,预计耗时 40~50ms:笔记本 17-11800H 
在头条上有个人说同样功能的RUST代码,仅耗时4ms,我认为纯扯淡

 

   1 Option Explicit
   2 
   3 Private Declare Function strcmp Lib "kernel32.dll" Alias "lstrcmpW" (ByVal StrPtr_lpString1 As Long, ByVal StrPtr_lpString2 As Long) As Long
   4 Private Declare Function strcmpi Lib "kernel32.dll" Alias "lstrcmpiW" (ByVal StrPtr_lpString1 As Long, ByVal StrPtr_lpString2 As Long) As Long
   5 Private Declare Function strlen Lib "kernel32.dll" Alias "lstrlenW" (ByVal StrPtr_lpString As Long) As Long
   6 
   7 Private Type html_bytes_loop_point
   8     
   9     htmlBytes()         As Byte
  10     IndexOfStart        As Long
  11     IndexOfCurrent      As Long
  12     IndexOfEnd          As Long
  13     
  14 End Type
  15 
  16 Private Type html_element_attribute
  17     
  18     AttribName        As String
  19     AttribValue       As String
  20     
  21 End Type
  22 
  23 Private Type html_element
  24     
  25     ElementName     As String
  26 '    elementIndex    As Long
  27     
  28     AttribCount     As Long
  29     Attribs()       As html_element_attribute
  30     
  31 '    IsCloseElement  As Boolean
  32     
  33 '    SizeOfTag       As Long
  34     
  35 '    IndexNext       As Long                             '// 思来想去,为了这两个属性而在解析时计算,开销非常大不合理,从而取消
  36 '    IndexPrev       As Long                             '// 同名元素的顺序索引在获取时判断计算,如获取 div.index=3 时,找到父元素,循环判断子元素 div ,同时计数
  37     IndexParent     As Long
  38 '    IndexCurrent    As Long
  39     
  40 '    IndexChildStart As Long
  41 '    IndexChildEnds  As Long
  42     
  43     tagTitle_from   As Long                             '// 表示 <div xxx....> 的全部内容位置,包括 <>
  44     tagTitle_to     As Long
  45     
  46     tagBody_from    As Long                             '// 表示 <div> xxxx </div>   xxxx 的全部内容
  47     tagBody_to      As Long
  48     
  49     tagfinal_from   As Long                             '// 表示 </div> 的全部内容位置,包括 <>
  50     tagfinal_to     As Long
  51     '// 通过以上3对开始-结束标记,可以准确获取多级嵌套元素结构下,排除所有标签,仅提取文本内容
  52     '// 如: <div xx1> abcd <div xx2> hijk <p> mnbc </p> qwer </div> nhytirs </div>
  53     '// 可以根据 div xx1 的元素结构信息,获取 div xx1 的标签位置和长度,包括的内容起始终止位置,闭合标签位置,再读取子元素的3个位置排除标签内容
  54     
  55 End Type
  56 
  57 Private Type html_element_collection
  58     
  59     rwOffsetElement             As Long
  60     SizeOfElement               As Long
  61     htmlElement()               As html_element                    '// 根据 xpath 分析器指定的路径解析出的元素
  62         
  63 End Type
  64 
  65 Private Type ThisClassSet
  66                                                                     '// 在一个函数中实现指定名称标签的跳过,为了避免某些特殊标签造成解析错误
  67     htmlElementStackSkipTagLeval As Long                            '// 表示要跳过的标签当前层级:要跳过的标签可能存在嵌套问题,如:<svg><svg></svg></svg>
  68     htmlElementStackSkipTag     As String                           '// 表示当前存在要跳过的标签,内容
  69     
  70     htmlElementStackTopIndex    As Long                             '// webpage_analyze_tag_attribs 解析标签时,维护元素父子关系的临时栈顶索引的变量
  71     htmlElementCollection       As html_element_collection
  72     
  73     SizeOfHtml                  As Long
  74     ByteOfHtml()                As Byte                             '// html code
  75     
  76     ByteSymbolTable(255)        As Byte
  77     
  78 End Type
  79 Dim PG                      As ThisClassSet
  80 
  81 'Dim htmlLabelArray()    As dom_element_in_xpath_analyzer
  82 'Dim htmlelements        As simple_htmlanalyzer_documentobjectmodel
  83 
  84 Event AnalyzeProcess(ByVal CurrentProcess As Long)
  85 
  86 Function webpage_analyze_GetElementInnerHtml() As String
  87     
  88 End Function
  89 
  90 
  91 Function webpage_analyze_GetElementInnerText() As String
  92     
  93 End Function
  94 
  95 Private Function webpage_analyze_skipcdata(webpageTag() As Byte, ByVal tagfrom As Long, ByVal htmlByteMax As Long) As Long
  96     '// 跳过CDATA内容(兼容不规范语法),返回CDATA结束后的位置
  97     '// 处理场景:<![CDATA[...]]>、<![  CDATA[...]] >、<![CDATA [ ... ] ]>等
  98     
  99     Dim iCount As Long
 100     '// 状态标记:跟踪CDATA起始的匹配进度
 101     Dim flag_cdata_bang As Long ' 已匹配"!"的位置(<![CDATA[中的!)
 102     Dim flag_cdata_bracket1 As Long ' 已匹配第一个"["的位置(<![CDATA[中的第一个[)
 103     Dim flag_cdata_C As Long ' 已匹配"C"的位置
 104     Dim flag_cdata_D As Long ' 已匹配"D"的位置
 105     Dim flag_cdata_A1 As Long ' 已匹配第一个"A"的位置
 106     Dim flag_cdata_T As Long ' 已匹配"T"的位置
 107     Dim flag_cdata_A2 As Long ' 已匹配第二个"A"的位置
 108     Dim flag_cdata_bracket2 As Long ' 已匹配第二个"["的位置(<![CDATA[中的第二个[)
 109     '// 状态标记:跟踪CDATA结束的匹配进度
 110     Dim flag_cdata_end1 As Long ' 已匹配第一个"]"的位置(]]>中的第一个])
 111     Dim flag_cdata_end2 As Long ' 已匹配第二个"]"的位置(]]>中的第二个])
 112     
 113     '// 从tagfrom开始(已跳过"<")
 114     iCount = tagfrom
 115     Do While iCount <= htmlByteMax
 116         '// 只处理有效字节(忽略高字节,和你的注释函数一致)
 117         If webpageTag(iCount + 1) = 0 Then
 118             Dim b As Byte
 119             b = webpageTag(iCount)
 120             
 121             '// 第一阶段:匹配CDATA起始(<![CDATA[)
 122             If flag_cdata_bracket2 = 0 Then ' 尚未完全匹配起始
 123                 Select Case b
 124                     '// 匹配"!"(<![CDATA[中的!)
 125                     Case 33: If flag_cdata_bang = 0 Then flag_cdata_bang = iCount
 126                     
 127                     '// 匹配第一个"["(<![CDATA[中的第一个[)
 128                     Case 91
 129                     If flag_cdata_bang > 0 And flag_cdata_bracket1 = 0 Then
 130                         flag_cdata_bracket1 = iCount
 131                     End If
 132                     
 133                     '// 匹配"C"(CDATA中的C)
 134                     Case 67:
 135                     If flag_cdata_bracket1 > 0 And flag_cdata_C = 0 Then
 136                         flag_cdata_C = iCount
 137                     End If
 138                     
 139                     '// 匹配"D"(CDATA中的D)
 140                     Case 68
 141                     If flag_cdata_C > 0 And flag_cdata_D = 0 Then
 142                         flag_cdata_D = iCount
 143                     End If
 144                     
 145                     '// 匹配第一个"A"(CDATA中的A)
 146                     Case 65
 147                     If flag_cdata_D > 0 And flag_cdata_A1 = 0 Then
 148                         flag_cdata_A1 = iCount
 149                     End If
 150                     
 151                     '// 匹配"T"(CDATA中的T)
 152                     Case 84
 153                     If flag_cdata_A1 > 0 And flag_cdata_T = 0 Then
 154                         flag_cdata_T = iCount
 155                     End If
 156                     
 157                     '// 匹配第二个"A"(CDATA中的A)
 158                     Case 65
 159                     If flag_cdata_T > 0 And flag_cdata_A2 = 0 Then
 160                         flag_cdata_A2 = iCount
 161                     End If
 162                     
 163                     '// 匹配第二个"["(<![CDATA[中的第二个[)
 164                     Case 91
 165                     If flag_cdata_A2 > 0 And flag_cdata_bracket2 = 0 Then
 166                         flag_cdata_bracket2 = iCount ' 起始匹配完成,开始找结束
 167                     End If
 168                     
 169                     '// 容错:忽略空格、换行、制表符(允许起始中插入这些字符)
 170                     Case 9, 10, 13, 32: ' do nothing
 171                     
 172                     '// 非CDATA起始字符,且尚未开始匹配,退出
 173                     Case Else
 174                     If flag_cdata_bang = 0 Then
 175                         webpage_analyze_skipcdata = 0
 176                         Exit Function
 177                     End If
 178                 End Select
 179             
 180             '// 第二阶段:匹配CDATA结束(]]>)
 181             Else
 182                 Select Case b
 183                     '// 匹配第一个"]"
 184                     Case 93
 185                     If flag_cdata_end1 = 0 Then
 186                         flag_cdata_end1 = iCount
 187                     End If
 188                     
 189                     '// 匹配第二个"]"
 190                     Case 93
 191                     If flag_cdata_end1 > 0 And flag_cdata_end2 = 0 Then
 192                         flag_cdata_end2 = iCount
 193                     End If
 194                     
 195                     '// 匹配">",且已找到两个"]",视为结束
 196                     Case 62
 197                     If flag_cdata_end2 > 0 Then
 198                         webpage_analyze_skipcdata = iCount ' 返回">"的位置
 199                         Exit Function
 200                     End If
 201                     
 202                     '// 容错:忽略空格、换行等(允许结束中插入这些字符)
 203                     Case 9, 10, 13, 32: ' do nothing
 204                     
 205                     '// 其他字符不影响,继续找结束符
 206                     Case Else: ' do nothing
 207                 End Select
 208             End If
 209         End If
 210         iCount = iCount + 2 ' 和你的注释函数一致,步长2
 211     Loop
 212     
 213     '// 遍历结束仍未找到结束符,视为未闭合(容错处理)
 214     webpage_analyze_skipcdata = htmlByteMax
 215 End Function
 216 
 217 Private Function webpage_analyze_skipremark(webpageTag() As Byte, ByVal tagfrom As Long, ByVal htmlByteMax As Long) As Long
 218     
 219     '// 跳过注释内容,返回注释的最后一个位置
 220     
 221     Dim iCount          As Long
 222     
 223     Dim flag_remark     As Long     '// 注释标志符  !
 224     Dim flag_hyphen     As Long     '// 注释终止符  -
 225     Dim flag_double_hyphen  As Long '//             - -
 226     
 227     
 228     For iCount = tagfrom To htmlByteMax Step 2
 229     
 230         If webpageTag(iCount + 1) = 0 Then
 231         
 232             '// 查找 <
 233             Select Case PG.ByteOfHtml(iCount)
 234             
 235                    Case 9, 10, 13, 32
 236                         '// 容错,可忽略内容, 遇到 tab/changeline/enter/space 时,如果前一个字符是 - 则把 - 的位置 + 1,与下一个字符组成一个连续的双字符
 237                         '//                   由之前的 - - 变成 -- 连续的双横线,以标志双横线标志, 如果事双横线遇到此类特殊打断符号, 则把双横线整体后移
 238                         '//                   整体移动后,可能形成 --> 表示,注释结束
 239                         If flag_hyphen Then
 240                             '// 如果上一个字符是 - ,本次字符是可忽略的,进行容错处理
 241                             If iCount - flag_hyphen = 2 Then flag_hyphen = iCount
 242                         End If
 243                             
 244                         If flag_double_hyphen Then
 245                             '// 如果前2个字符是已识别的 - - ,忽略这次的容错字符
 246                             If iCount - flag_double_hyphen = 2 Then flag_double_hyphen = iCount
 247                         End If
 248                             
 249                             
 250                         '// !  找到!标志,表示是注释内容
 251                    Case 33: If flag_remark = 0 Then flag_remark = iCount
 252                         
 253                    Case 45
 254                         '// -
 255                             '// 连续两次找到 - , 保存和计算 - ,连续的2个 - 做双 - 标志
 256                             If iCount - flag_hyphen = 2 Then flag_double_hyphen = iCount
 257                             
 258                             flag_hyphen = iCount
 259                         
 260                         
 261                    Case 62
 262                         '// >  检查上次双 - 是否是最近的一次,如是表示注释结尾
 263                         If iCount - flag_double_hyphen = 2 Then webpage_analyze_skipremark = iCount: Exit Function
 264                         
 265                         
 266                         
 267                    Case Else
 268                         '// 非注释内容, 退出
 269                         If flag_remark = 0 Then webpage_analyze_skipremark = 0: Exit Function
 270                         
 271             End Select
 272             
 273         End If
 274         
 275     Next
 276     
 277     
 278     
 279 End Function
 280 
 281 Private Function webpage_analyze_tag_attribs_skiptagsetstack(ByVal strTagName As String, htmlBytes() As Byte, ByVal bytefrom As Long) As Long
 282     
 283     '// 根据指定的标签名,直接查找对应的闭合标签; 相当于跳过标签
 284     
 285     Dim iCount              As Long
 286     
 287 '    Dim flag_hyphen     As Long     '// 注释终止符  -
 288 '    Dim flag_double_hyphen  As Long '//             - -
 289     
 290     webpage_analyze_tag_attribs_skiptagsetstack = 1
 291     
 292     PG.htmlElementStackSkipTagLeval = 1
 293     
 294     Select Case Len(strTagName)
 295     
 296         Case 1
 297             If strcmpi(ByVal StrPtr(strTagName), ByVal StrPtr("!")) = 0 Then
 298                 webpage_analyze_tag_attribs_skiptagsetstack = webpage_analyze_skipremark(htmlBytes, bytefrom, PG.SizeOfHtml)
 299             End If
 300             
 301             PG.htmlElementStackSkipTagLeval = 0
 302             
 303         Case 3: If strcmpi(ByVal StrPtr(strTagName), ByVal StrPtr("svg")) = 0 Then PG.htmlElementStackSkipTag = "svg"
 304         Case 4: If strcmpi(ByVal StrPtr(strTagName), ByVal StrPtr("math")) = 0 Then PG.htmlElementStackSkipTag = "math"
 305         Case 5: If strcmpi(ByVal StrPtr(strTagName), ByVal StrPtr("style")) = 0 Then PG.htmlElementStackSkipTag = "style"
 306         Case 6: If strcmpi(ByVal StrPtr(strTagName), ByVal StrPtr("script")) = 0 Then PG.htmlElementStackSkipTag = "script"
 307         Case 8: If strcmpi(ByVal StrPtr(strTagName), ByVal StrPtr("noscript")) = 0 Then PG.htmlElementStackSkipTag = "noscript"
 308     '    ElseIf lstrcmpi(strTagName, "[cdata[") = 0 Then: PG.htmlElementStackSkipTag = ""
 309         '// 以 <![CDATA[ 开头,以 ]]> 结尾,,,, <![CDATA[ 这里的内容会被原样保留,包括 <、>、& 等特殊字符 ]]>
 310             
 311         Case Else: webpage_analyze_tag_attribs_skiptagsetstack = 0: PG.htmlElementStackSkipTagLeval = 0
 312         
 313     End Select
 314     
 315 End Function
 316 
 317 Private Function webpage_analyze_elements_realloc() As Long
 318     
 319     '// 获取一个新的可用的的数组元素索引值
 320     
 321     With PG.htmlElementCollection
 322         
 323         webpage_analyze_elements_realloc = .rwOffsetElement
 324         
 325         If .rwOffsetElement >= .SizeOfElement Then
 326             .SizeOfElement = .SizeOfElement + 1000
 327             ReDim Preserve .htmlElement(PG.htmlElementCollection.SizeOfElement)
 328             
 329         End If
 330         
 331         .rwOffsetElement = .rwOffsetElement + 1
 332         
 333     End With
 334         
 335 End Function
 336 
 337 Private Function webpage_analyze_tag_attribs_getlastmatchelement(ByVal tagName As String) As Long
 338     
 339     '// 根据指定的闭合标签名,获取对应的引导标签数组索引值
 340     
 341     Dim iCount      As Long
 342     
 343     '// 默认值 = -1, 未找到匹配的引导标签
 344     webpage_analyze_tag_attribs_getlastmatchelement = -1
 345     
 346     For iCount = PG.htmlElementCollection.rwOffsetElement - 1 To 0 Step -1
 347         
 348         '// 比较标签名
 349         If strcmpi(ByVal StrPtr(tagName), ByVal StrPtr(PG.htmlElementCollection.htmlElement(iCount).ElementName)) = 0 Then
 350             
 351             '// 找到匹配的标签名后, 检查该元素结束标签数据值是否存在
 352             If PG.htmlElementCollection.htmlElement(iCount).tagfinal_to = 0 Then webpage_analyze_tag_attribs_getlastmatchelement = iCount: Exit For
 353         
 354         End If
 355         
 356     Next
 357     
 358 End Function
 359 
 360 Private Function webpage_analyze_tag_attribs(WebPageTagfield() As String, htmlBytes() As Byte, ByVal bytefrom As Long, ByVal byteto As Long) As Long
 361     
 362     
 363     ' 状态机状态定义
 364     Const STATE_WAIT_TAG_NAME       As Long = 0      ' 等待标签名
 365     Const STATE_WAIT_ATTR_NAME      As Long = 1      ' 等待属性名
 366     Const STATE_WAIT_OPERATOR_DH    As Long = 2      ' 等待操作符(=)
 367     Const STATE_WAIT_OPERATOR_MH    As Long = 3      ' 等待操作符(:)
 368     
 369     Const STATE_WAIT_ATTR_VALUE     As Long = 4    ' 等待属性值
 370     
 371     
 372     Dim STATE_INDICATOR             As Long
 373     Dim iCount                      As Long
 374     Dim BoundOfTags                 As Long
 375     
 376 '    Dim domElement                  As html_element
 377     
 378     Dim IndexElementlastMatch       As Long         ' 逆向查找到的匹配标签索引
 379     Dim IndexElementAlloc           As Long         ' 新申请的标签项目索引
 380     
 381     Dim rwOffsetAttrib            As Long         '// 属性数组读写点
 382     
 383     
 384 '    Dim aii As Long
 385 '    aii = UBound(WebPageTagfield)
 386 '
 387 '    Dim na As Long
 388 '    For na = 0 To aii
 389 '        Debug.Print WebPageTagfield(na); " & "
 390 '    Next
 391 '    Debug.Print vbCrLf, Timer, "----------------------------------------------------------"
 392 '
 393 '    Exit Function
 394     
 395     
 396 
 397     
 398     
 399     BoundOfTags = UBound(WebPageTagfield)
 400     
 401 '    ReDim domElement.Attribs(BoundOfTags)
 402     
 403     For iCount = 0 To BoundOfTags
 404         
 405         If Len(WebPageTagfield(iCount)) Then
 406             
 407             '// 状态指示器
 408             Select Case STATE_INDICATOR
 409                    
 410                    '//  等待获取标签名
 411                    Case STATE_WAIT_TAG_NAME
 412 
 413                         '// 检查标签范围内第1个字符,如果是 / ,表示闭合标签
 414                         If lstrcmp(WebPageTagfield(iCount), "/") = 0 Then
 415                             
 416                             '// 检查是否存在跳过栈:要跳过的标签名, 先检查跳过栈内容是否存在,此一步是为了执行效率
 417                             If Len(PG.htmlElementStackSkipTag) Then
 418                                 
 419                                 '// 此一步是为了执行效率,尽量较少字符串比对(慢)
 420                                 If Len(PG.htmlElementStackSkipTag) = Len(WebPageTagfield(iCount + 1)) Then
 421                                 
 422                                     '// 要跳过的标签与本次即将保存的标签名字不同时则退出 = 跳过
 423                                     If strcmpi(ByVal StrPtr(WebPageTagfield(iCount + 1)), ByVal StrPtr(PG.htmlElementStackSkipTag)) <> 0 Then
 424                                         
 425                                         Exit Function
 426                                         
 427                                     Else
 428                                         '// 检查嵌套情况
 429                                         PG.htmlElementStackSkipTagLeval = PG.htmlElementStackSkipTagLeval - 1
 430                                         
 431                                     End If
 432                                 
 433                                 End If
 434                             
 435                             End If
 436                             
 437                             '// 不存在嵌套或嵌套解除
 438                             If PG.htmlElementStackSkipTagLeval Then Exit Function
 439                             
 440                             '// 找到闭合标签时, 检查匹配的引导标签
 441                             IndexElementlastMatch = webpage_analyze_tag_attribs_getlastmatchelement(WebPageTagfield(iCount + 1))
 442                             
 443                             If IndexElementlastMatch <> -1 Then
 444                                 
 445                                 '// 找到引导标签后更新标签全部信息, 为找到时( -1 ) 直接丢弃
 446                                 With PG.htmlElementCollection.htmlElement(IndexElementlastMatch)
 447                                     
 448                                     .tagBody_to = bytefrom - 1
 449                                     .tagfinal_from = bytefrom
 450                                     .tagfinal_to = byteto
 451                                     
 452                                     '// 找到闭合标签时,恢复栈顶元素为栈顶父元素
 453                                     PG.htmlElementStackTopIndex = .IndexParent
 454                                     
 455                                     '// 如果顺利保存闭合标签, 则将跳过栈清空
 456                                     PG.htmlElementStackSkipTag = vbNullString
 457                                     
 458                                     PG.htmlElementStackSkipTagLeval = 0
 459                                     
 460                                 End With
 461                                 
 462                             End If
 463                             
 464                             Exit Function
 465                             
 466                             
 467                         ElseIf WebPageTagfield(iCount) = "!" Then
 468                             '// 检查注释
 469                             
 470                             
 471 '                        ElseIf lstrcmpi(WebPageTagfield(iCount), "script") = 0 Then
 472 '
 473 '                        ElseIf lstrcmpi(WebPageTagfield(iCount), "style") = 0 Then
 474 '
 475 '                        ElseIf lstrcmpi(WebPageTagfield(iCount), "noscript") = 0 Then
 476 '
 477 '                        ElseIf lstrcmpi(WebPageTagfield(iCount), "svg") = 0 Then
 478 '
 479 '                        ElseIf lstrcmpi(WebPageTagfield(iCount), "math") = 0 Then
 480                             'canvas  <applet>   <embed>
 481                             
 482                         Else
 483                             '// 正常提取标签名
 484                             
 485                             '// 检查是否存在 跳过 标签
 486                             If Len(PG.htmlElementStackSkipTag) Then
 487                                 
 488                                 '// 检查本次标签与跳过标签是否长度一致
 489                                 If Len(PG.htmlElementStackSkipTag) = Len(WebPageTagfield(iCount)) Then
 490                                     
 491                                     '// 比对标签是否完全一致, 如果是嵌套层级 + 1
 492                                     If strcmpi(ByVal StrPtr(PG.htmlElementStackSkipTag), ByVal StrPtr(WebPageTagfield(iCount))) = 0 Then PG.htmlElementStackSkipTagLeval = PG.htmlElementStackSkipTagLeval + 1
 493                                     
 494                                 End If
 495                                 
 496                                 Exit Function
 497                                 
 498                             End If
 499                             
 500                             '// 获取新建标签,获取索引值
 501                             IndexElementAlloc = webpage_analyze_elements_realloc
 502                             
 503                             '// 新标签元素属性赋值
 504                             With PG.htmlElementCollection.htmlElement(IndexElementAlloc)
 505                                 
 506                                 '// 标签名
 507                                 .ElementName = WebPageTagfield(iCount)
 508                                 
 509                                 '// 引导标签的数据范围
 510                                 .tagTitle_from = bytefrom
 511                                 .tagTitle_to = byteto
 512                                 
 513                                 '// 标签内容的数据起始范围
 514                                 .tagBody_from = byteto + 1
 515                                 
 516                                 '// 当前标签的父标签元素是栈顶元素
 517                                 .IndexParent = PG.htmlElementStackTopIndex
 518                                 
 519                                 '// 每一个新标签都设置为栈顶标签
 520                                 PG.htmlElementStackTopIndex = IndexElementAlloc
 521                                 
 522                                 
 523 '                                '// 此处检查标签名,配置跳过栈 ========>>>> 注释标签还需要调整 数据范围, cdata 标签???  ■■ 自闭和标签???
 524 '                                If webpage_analyze_tag_attribs_skiptagsetstack(.ElementName, htmlBytes, bytefrom) Then Exit Function
 525                                 
 526                                 '// 此处检查标签名,配置跳过栈 ========>>>> ! 开头的单独处理
 527                                 webpage_analyze_tag_attribs_skiptagsetstack .ElementName, htmlBytes, bytefrom
 528                                 
 529                                 '// 初始化标签元素属性数组
 530                                 .AttribCount = 100
 531                                 ReDim .Attribs(.AttribCount)
 532                                 
 533                                 
 534                             End With
 535                             
 536                         End If
 537                         
 538                         
 539                         '// 状态指示器, 只想属性获取
 540                         STATE_INDICATOR = STATE_WAIT_ATTR_NAME
 541 
 542                    
 543                    
 544                    
 545                    Case STATE_WAIT_ATTR_NAME
 546                    
 547                         If iCount = BoundOfTags Then
 548                             
 549                             '// 这可能是自闭合标签, 这个 / 是最后一个字符
 550                             If WebPageTagfield(iCount) = "/" Then
 551                                 
 552                                 '// 找到引导标签后更新标签全部信息, 为找到时( -1 ) 直接丢弃
 553                                 With PG.htmlElementCollection.htmlElement(IndexElementAlloc)
 554                                     
 555                                     '// 自闭合标签的数据范围都是完全一致的
 556                                     .tagBody_from = bytefrom
 557                                     .tagBody_to = byteto
 558                                     .tagfinal_from = bytefrom
 559                                     .tagfinal_to = byteto
 560                                     
 561                                     '// 找到闭合标签时,恢复栈顶元素为栈顶父元素
 562                                     PG.htmlElementStackTopIndex = .IndexParent
 563                                     
 564                                     '// 如果顺利保存闭合标签, 则将跳过栈清空
 565                                     PG.htmlElementStackSkipTag = vbNullString
 566                                     
 567                                     PG.htmlElementStackSkipTagLeval = 0
 568                                     
 569                                 End With
 570                                 
 571                             End If
 572                             
 573                         Else
 574                             
 575                             '// 保存、识别属性名
 576                             PG.htmlElementCollection.htmlElement(IndexElementAlloc).Attribs(rwOffsetAttrib).AttribName = WebPageTagfield(iCount)
 577 '                            rwOffsetAttrib = rwOffsetAttrib + 1
 578                             
 579                             '// 设置下次状态为识别 =
 580                             STATE_INDICATOR = STATE_WAIT_OPERATOR_DH
 581                             
 582                         End If
 583                    
 584                    
 585                    
 586                    Case STATE_WAIT_OPERATOR_DH
 587                         
 588                         '// 如果本次识别的内容为 = , 符合 属性名 = 属性值 的标准, 配置下一此识别内容为属性值
 589                         If WebPageTagfield(iCount) = "=" Then
 590                         
 591                             STATE_INDICATOR = STATE_WAIT_ATTR_VALUE
 592                         
 593                         Else
 594                             
 595                             '// 本次识别的内容不为 = 时, 视为无值属性, 将本次内容视为新属性开始, 保存属性名
 596                             '// 此时状态 = WAIT NAME
 597                             '   STATE_INDICATOR = STATE_WAIT_ATTR_NAME
 598                             
 599                             rwOffsetAttrib = rwOffsetAttrib + 1
 600                             PG.htmlElementCollection.htmlElement(IndexElementAlloc).Attribs(rwOffsetAttrib).AttribName = WebPageTagfield(iCount)
 601                             
 602                             STATE_INDICATOR = STATE_WAIT_OPERATOR_DH
 603                             
 604                         End If
 605                         
 606                         
 607 
 608                         
 609                    Case STATE_WAIT_ATTR_VALUE
 610                                                 
 611                         '// 本次内容应为属性值, 识别,保存
 612                         PG.htmlElementCollection.htmlElement(IndexElementAlloc).Attribs(rwOffsetAttrib).AttribValue = WebPageTagfield(iCount)
 613                         
 614                         '// 检查操作符, 可能遇到非规范的 style = "color:red" ,由于缺少双引号导致被拆分识别
 615                         If (iCount + 2 <= BoundOfTags) Then
 616                             If (WebPageTagfield(iCount + 1) = ":") Then
 617                                                                 
 618                                 PG.htmlElementCollection.htmlElement(IndexElementAlloc).Attribs(rwOffsetAttrib).AttribValue = WebPageTagfield(iCount) & WebPageTagfield(iCount + 1) & WebPageTagfield(iCount + 2)
 619                                 
 620                                 iCount = iCount + 2
 621                                 
 622                             End If
 623                         End If
 624                         
 625                         '// 属性名/属性值提取完成, 将属性数组的读写指针指向下一个
 626                         rwOffsetAttrib = rwOffsetAttrib + 1
 627                         
 628                         '// 循环识别保存属性, 重置状态为属性循环
 629                         STATE_INDICATOR = STATE_WAIT_ATTR_NAME
 630                         
 631                         
 632 '                   Case STATE_WAIT_OPERATOR_MH
 633 '
 634 '                        If WebPageTagfield(iCount) = ":" Then
 635 '
 636 '                            domElement.Attribs(domElement.AttribCount).AttribValue = WebPageTagfield(iCount) & ":" & WebPageTagfield(iCount + 1)
 637 '                            iCount = iCount + 2
 638 '
 639 '                        Else
 640 '                            iCount = iCount - 1
 641 '
 642 '                        End If
 643 '
 644 '                        STATE_INDICATOR = STATE_WAIT_ATTR_NAME
 645                         
 646             End Select
 647             
 648         Else
 649             MsgBox "找到一个空的元素"
 650         End If
 651         
 652         
 653     Next
 654     
 655     If PG.htmlElementCollection.rwOffsetElement Then ReDim Preserve PG.htmlElementCollection.htmlElement(IndexElementAlloc).Attribs(rwOffsetAttrib - 1): PG.htmlElementCollection.htmlElement(IndexElementAlloc).AttribCount = rwOffsetAttrib
 656     
 657     Debug.Print Timer, "标签名", PG.htmlElementCollection.htmlElement(IndexElementAlloc).ElementName, "attirbs:", PG.htmlElementCollection.htmlElement(IndexElementAlloc).AttribCount
 658     
 659     
 660 End Function
 661 
 662 
 663 Private Function webpage_analyze_getstring(webpageTag() As Byte, ByVal bytefrom As Long, ByVal byteto As Long) As String
 664     
 665     If byteto - bytefrom > 1 Then
 666         
 667         'byteto = (byteto Or 1) + 1 '// byte align to char , byteto or 1 = 判断奇偶值, 偶数 or 1 = 奇数, 奇数 or 1 = 奇数 , + 1 = 下一个字符的第一个字节位置
 668         
 669         webpage_analyze_getstring = Space((byteto - bytefrom) \ 2)
 670         
 671         CopyMemory ByVal StrPtr(webpage_analyze_getstring), webpageTag(bytefrom), byteto - bytefrom
 672         
 673     End If
 674     
 675 End Function
 676 
 677 Private Function webpage_analyze_tag(webpageTag() As Byte, ByVal tagfrom As Long, ByVal tagto As Long) As Long
 678 
 679     
 680     Dim iCount              As Long
 681     
 682     
 683     If tagto - tagfrom = 2 Then Exit Function       '// 没有具体内容, 形式为: <>
 684     
 685 '    With webpage_analyze_tag
 686 '
 687 ''        .tagHtml_from = tagfrom
 688 ''        .tagHtml_to = tagto
 689 '
 690 '        ReDim .Attribs(100)
 691 '
 692 '    End With
 693     
 694     
 695     Dim tagAttribField()        As String
 696     Dim psAttribField           As Long
 697     
 698     Dim flag_tag_leftStart      As Long
 699     Dim flag_DanYinHao          As Long         '// 单引号 标志位
 700     Dim flag_ShuangYinHao       As Long         '// 双引号 标志位
 701 '    Dim flag_Space              As Long         '// 空格 标志位
 702     
 703 
 704     ReDim tagAttribField(tagto - tagfrom)
 705     
 706     For iCount = tagfrom + 2 To tagto Step 2
 707         
 708 '        If webpageTag(iCount + 1) = 0 Then
 709             
 710 '            If iCount = 1366 Then Stop
 711 '            If iCount = 1360 Then Stop
 712             
 713             Select Case PG.ByteSymbolTable(webpageTag(iCount))
 714                    
 715                    '// 13, 10, 9 32,58 ,enter,changeline,tab,space, >, : ,
 716                    Case 1
 717                         '// 忽略在标签字符前的中断符,实现一定的容错处理
 718                         
 719                         If webpageTag(iCount + 1) = 0 Then
 720                         
 721                             '// 如果不在引号范围内 or 如果遇到结尾符号: >
 722                             If ((flag_DanYinHao = 0) And (flag_ShuangYinHao = 0)) Or (iCount = tagto) Then
 723                                 
 724                                 '// 如果是字段结尾
 725                                 If flag_tag_leftStart Then
 726                                     
 727                                     tagAttribField(psAttribField) = webpage_analyze_getstring(webpageTag, flag_tag_leftStart, iCount)
 728                                     psAttribField = psAttribField + 1
 729                                     
 730                                     webpage_analyze_flag_setbitszero flag_tag_leftStart, flag_DanYinHao, flag_ShuangYinHao
 731                                     
 732                                 End If
 733                                 
 734                                 
 735                                 '// : , =
 736                                 If (webpageTag(iCount) = 58) Or (webpageTag(iCount) = 61) Then
 737                                 
 738                                     tagAttribField(psAttribField) = String(1, webpageTag(iCount))
 739                                     psAttribField = psAttribField + 1
 740     
 741                                     webpage_analyze_flag_setbitszero flag_tag_leftStart, flag_DanYinHao, flag_ShuangYinHao
 742     
 743                                 End If
 744                                 
 745                             ElseIf (flag_DanYinHao) Or (flag_ShuangYinHao) Then
 746                                 If flag_tag_leftStart = 0 Then flag_tag_leftStart = iCount
 747                                 
 748                             End If
 749                         
 750                         End If
 751                    
 752                    '// 34 双引号
 753                    Case 2
 754                         If webpageTag(iCount + 1) = 0 Then
 755                             '// 敏感引号类型, 对于单双引号套双单引号的情况, 只对第1个引号类型敏感
 756                             If flag_DanYinHao = 0 Then
 757                             
 758                                 If flag_ShuangYinHao Then
 759                                     
 760                                     '// 如果是字段结尾
 761 '                                    If flag_tag_leftStart Then
 762     
 763 '                                        tagAttribField(psAttribField) = webpage_analyze_getstring(webpageTag, flag_tag_leftStart, iCount)
 764                                         tagAttribField(psAttribField) = webpage_analyze_getstring(webpageTag, flag_ShuangYinHao + 2, iCount)
 765                                         psAttribField = psAttribField + 1
 766                                         
 767                                         webpage_analyze_flag_setbitszero flag_tag_leftStart, flag_DanYinHao, flag_ShuangYinHao
 768                                         
 769 '                                    End If
 770                                     
 771                                     
 772                                     
 773                                 Else
 774                                     flag_ShuangYinHao = iCount
 775                                 End If
 776                             End If
 777                         End If
 778                    
 779                    
 780                    '// 39 单引号
 781                    Case 3
 782                         If webpageTag(iCount + 1) = 0 Then
 783                    
 784                             '// 敏感引号类型, 对于单双引号套双单引号的情况, 只对第1个引号类型敏感
 785                             If flag_ShuangYinHao = 0 Then
 786                             
 787                                 If flag_DanYinHao Then
 788                                     
 789 '                                    '// 如果是字段结尾
 790 '                                    If flag_tag_leftStart Then
 791                                         
 792 '                                        tagAttribField(psAttribField) = webpage_analyze_getstring(webpageTag, flag_tag_leftStart, iCount)
 793                                         tagAttribField(psAttribField) = webpage_analyze_getstring(webpageTag, flag_DanYinHao + 2, iCount)
 794                                         psAttribField = psAttribField + 1
 795                                         
 796                                         webpage_analyze_flag_setbitszero flag_tag_leftStart, flag_DanYinHao, flag_ShuangYinHao
 797                                         
 798 '                                    End If
 799                                     
 800                                     
 801                                 Else
 802                                     flag_DanYinHao = iCount
 803                                 End If
 804                             End If
 805                             
 806                         End If
 807                         
 808                    
 809                    '// /  闭合标签
 810                    Case 4
 811                         If webpageTag(iCount + 1) = 0 Then
 812                         
 813                             If (flag_tag_leftStart + flag_DanYinHao + flag_ShuangYinHao) = 0 Then
 814                                                             
 815                                 tagAttribField(psAttribField) = "/"
 816                                 psAttribField = psAttribField + 1
 817                             
 818                             End If
 819                             
 820                         End If
 821                         
 822                         
 823                    '// !
 824                    Case 5
 825                         If webpageTag(iCount + 1) = 0 Then
 826                         
 827                             If (flag_tag_leftStart + flag_DanYinHao + flag_ShuangYinHao) = 0 Then
 828                                                             
 829                                 tagAttribField(psAttribField) = "!"
 830                                 psAttribField = psAttribField + 1
 831                             
 832                             End If
 833                             
 834                         End If
 835                         
 836                    
 837                    '// a~z:97 To 122, A~Z:65 To 90, 0~9:48 To 57, -:45,_:95,
 838                    Case 6, 7, 0
 839                         
 840                         If flag_tag_leftStart = 0 Then flag_tag_leftStart = iCount
 841                         
 842                         
 843             
 844                    '//Case 7   '// 应单独判断数字类型作为前导字符的情况: 如果没有字母前导/没有引号前导,则标签或属性无效
 845                         
 846             
 847             End Select
 848         
 849 '        End If
 850         
 851     Next
 852     
 853     
 854     If psAttribField Then ReDim Preserve tagAttribField(psAttribField - 1)
 855     
 856     webpage_analyze_tag = webpage_analyze_tag_attribs(tagAttribField, webpageTag, tagfrom, tagto)
 857     
 858     
 859 End Function
 860 
 861 
 862 
 863 Private Sub webpage_analyze_flag_setbits(ByVal exp1 As Long, ByVal exp2 As Long, flagVar As Long, ByVal bitsVal As Long)
 864     '// 标志位带条件置位
 865     If (exp1) Or (exp2) Then flagVar = bitsVal
 866 End Sub
 867 
 868 Private Sub webpage_analyze_flag_setbitszero(exp1 As Long, Optional exp2 As Long, Optional exp3 As Long, Optional exp4 As Long, Optional exp5 As Long, Optional exp6 As Long, Optional exp7 As Long)
 869     '// 标志位全部清 0
 870     exp1 = 0: exp2 = 0: exp3 = 0: exp4 = 0: exp5 = 0: exp6 = 0: exp7 = 0
 871 End Sub
 872 
 873 '// 分析web页面源码
 874 Function webpage_Analyzer(ByVal strWebPage As String) As Long
 875     
 876     Dim mulBitsBuffer()         As Byte         '// 提取的标签字节码
 877     
 878     '// 获取webpage字节码,,缓存 html code, 保存 uboudund(html byte code)
 879     PG.ByteOfHtml = strWebPage
 880     
 881     '// 缓存webpage源码总字节量
 882     PG.SizeOfHtml = UBound(PG.ByteOfHtml)
 883 
 884     Dim iCount                  As Long         '// 循环计次
 885     
 886     Dim flag_left_FangKuoHao    As Long         '// 左 方括号标志位
 887     Dim flag_DanYinHao          As Long         '// 单引号 标志位
 888     Dim flag_ShuangYinHao       As Long         '// 双引号 标志位
 889     Dim flag_Space              As Long         '// 空格 标志位
 890     
 891     Dim flag_enter              As Long
 892     Dim flag_tab                As Long
 893     Dim flag_changeline         As Long
 894     
 895     Dim flag_remarkskippoint    As Long
 896     
 897 '    Dim ByteSymbolTable(255)   As Byte
 898 '
 899 ''    <    "   space  enter   changeline tab  '   >
 900 ''    60, 34,   32,    13,        10,     9, 39,  62
 901 '    PG.ByteSymbolTable(9) = 1
 902 '    PG.ByteSymbolTable(10) = 1
 903 '    PG.ByteSymbolTable(13) = 1
 904 '    PG.ByteSymbolTable(32) = 1
 905 '    PG.ByteSymbolTable(34) = 1
 906 '    PG.ByteSymbolTable(39) = 1
 907 '    PG.ByteSymbolTable(60) = 1
 908 '    PG.ByteSymbolTable(62) = 1
 909     
 910     
 911     flag_left_FangKuoHao = -1
 912     
 913     For iCount = 0 To PG.SizeOfHtml Step 2
 914     
 915         If PG.ByteOfHtml(iCount + 1) = 0 Then
 916         
 917             '// 查找 <
 918             Select Case PG.ByteOfHtml(iCount)
 919             
 920                    Case 60
 921                 
 922                         '// 未遇到过 <
 923                         If flag_left_FangKuoHao = -1 Then
 924                             
 925 '                            '// 检查是否是注释
 926 '                            flag_remarkskippoint = webpage_analyze_skipremark(PG.ByteOfHtml, iCount + 2, PG.SizeOfHtml)
 927 '
 928 '                            '// 检查注释,如是,则跳过         '// 非注释内容                     '// 注释内容,跳过注释
 929 '                            If flag_remarkskippoint = 0 Then flag_left_FangKuoHao = iCount Else iCount = flag_remarkskippoint
 930                             
 931                             flag_left_FangKuoHao = iCount
 932                             
 933                         
 934                         '// 每次找到 <  都直接找到 > ,忽略过程中遇到的错误的 < (不包括引号范围内的) ,在最终的 tag 解析过程中,检查语法,并返回下一个循环点,为 icount 使用
 935                         
 936                         
 937 '                        '// 在遇到过 < 但没有被引号包括时: 放弃之前的错误标签类型
 938 '                        Else
 939 '                            If (flag_ShuangYinHao = 0) Or (flag_DanYinHao = 0) Then
 940 '
 941 '                                '// 标志位 清0
 942 '                                webpage_analyze_flag_setbitszero flag_left_FangKuoHao, flag_DanYinHao, flag_ShuangYinHao, flag_Space, flag_enter, flag_changeline, flag_tab
 943 '
 944 '                                '// 标签识别起始位 置位
 945 '                                flag_left_FangKuoHao = iCount
 946 '
 947 '                            End If
 948                             
 949                         End If
 950     '        End If
 951             
 952                     '// 查找 "
 953                     Case 34
 954                         If flag_left_FangKuoHao > -1 Then
 955                             '// 敏感引号类型, 对于单双引号套双单引号的情况, 只对第1个引号类型敏感
 956                             If flag_DanYinHao = 0 Then
 957                             
 958                                 If flag_ShuangYinHao Then
 959                                     flag_ShuangYinHao = 0
 960                                 Else
 961                                     flag_ShuangYinHao = iCount
 962                                 End If
 963                             End If
 964                         End If
 965     '        End If
 966             
 967                     '// 查找 空格
 968                     Case 32: webpage_analyze_flag_setbits flag_DanYinHao, flag_ShuangYinHao, flag_Space, iCount
 969             
 970                     '// 查找 enter
 971                     Case 13: webpage_analyze_flag_setbits flag_DanYinHao, flag_ShuangYinHao, flag_enter, iCount
 972             
 973                     '// 查找 changeline
 974                     Case 10: webpage_analyze_flag_setbits flag_DanYinHao, flag_ShuangYinHao, flag_changeline, iCount
 975             
 976                     '// 查找 tab
 977                     Case 9: webpage_analyze_flag_setbits flag_DanYinHao, flag_ShuangYinHao, flag_tab, iCount
 978             
 979                     '// 查找 '
 980                     Case 39
 981                         If flag_left_FangKuoHao > -1 Then
 982                             '// 敏感引号类型, 对于单双引号套双单引号的情况, 只对第1个引号类型敏感
 983                             If flag_ShuangYinHao = 0 Then
 984                             
 985                                 If flag_DanYinHao Then
 986                                     flag_DanYinHao = 0
 987                                 Else
 988                                     flag_DanYinHao = iCount
 989                                 End If
 990                             End If
 991                         End If
 992             
 993             
 994                     '// 查找 >
 995                     Case 62
 996                         
 997                         '// 遇到 > 时,表明标签结束
 998                         If flag_left_FangKuoHao > -1 Then
 999                             
1000                             '// 如果单双引号都是闭合状态
1001                             If (flag_ShuangYinHao = 0) And (flag_DanYinHao = 0) Then
1002                                 
1003             '                    '// 提取标签内容, 包括左括号, 但不包括右括号
1004             '                    ReDim mulBitsBuffer(iCount - flag_left_FangKuoHao)
1005             '
1006             '                    CopyMemory mulBitsBuffer(0), PG.ByteOfHtml(flag_left_FangKuoHao + 2), iCount - flag_left_FangKuoHao - 2
1007             '
1008             '                    '// 解析标签
1009             '                    webpage_analyze_tag (mulBitsBuffer)
1010                                 webpage_analyze_tag PG.ByteOfHtml, flag_left_FangKuoHao, iCount + 1
1011                                 
1012                                 webpage_analyze_flag_setbitszero flag_left_FangKuoHao, flag_DanYinHao, flag_ShuangYinHao, flag_Space, flag_enter, flag_changeline, flag_tab
1013             
1014                                 flag_left_FangKuoHao = -1
1015                                 
1016                             Else
1017                                 '// 单双引号非闭合状态, 符合:大于50个字符,且遇有空格/回车/tab/换行时,或超过256个字符时,发现 > 强制闭合
1018                                 If iCount - flag_left_FangKuoHao > 100 Then
1019             
1020                                     Select Case True
1021                                            Case ((iCount - flag_left_FangKuoHao) > 512), ((flag_Space Or flag_enter Or flag_changeline Or flag_tab) <> 0)
1022             
1023             '                                    '// 提取标签内容, 包括左括号, 但不包括右括号
1024             '                                    ReDim mulBitsBuffer(iCount - flag_left_FangKuoHao)
1025             '
1026             '                                    CopyMemory mulBitsBuffer(0), PG.ByteOfHtml(flag_left_FangKuoHao + 2), iCount - flag_left_FangKuoHao - 2
1027             '
1028             '                                    '// 解析标签
1029             '                                    webpage_analyze_tag (mulBitsBuffer)
1030                                                 
1031                                                 webpage_analyze_tag PG.ByteOfHtml, flag_left_FangKuoHao, iCount + 1
1032                                                 
1033                                                 
1034                                                 '// 标志位清0
1035                                                 webpage_analyze_flag_setbitszero flag_left_FangKuoHao, flag_DanYinHao, flag_ShuangYinHao, flag_Space, flag_enter, flag_changeline, flag_tab
1036                                                 flag_left_FangKuoHao = -1
1037                                                 
1038                                     End Select
1039                                     
1040                                 End If
1041                                 
1042             
1043                                 
1044                                 
1045                             End If
1046                             
1047                         Else
1048                             '// 未发现过 < ,单独发现闭合符号, 直接舍弃忽略, '// 标志位清0
1049                             webpage_analyze_flag_setbitszero flag_left_FangKuoHao, flag_DanYinHao, flag_ShuangYinHao, flag_Space, flag_enter, flag_changeline, flag_tab
1050                             flag_left_FangKuoHao = -1
1051                             
1052                         End If
1053                 
1054                 End Select
1055                 
1056             End If
1057         
1058         
1059 '        RaiseEvent AnalyzeProcess(iCount / PG.SizeOfHtml * 90)
1060 '        DoEvents
1061         
1062     Next
1063     
1064     
1065     
1066     
1067 End Function
1068 
1069 Private Sub Class_Initialize()
1070     
1071     Dim iCount      As Long
1072     
1073     '**************************************** 做一个类型表 ***********************************************
1074     
1075 '    <    "   space  enter   changeline tab  '   >
1076 '    60, 34,   32,    13,        10,     9, 39,  62
1077 '    PG.ByteSymbolTable(9) = 1
1078 '    PG.ByteSymbolTable(10) = 2
1079 '    PG.ByteSymbolTable(13) = 3
1080 '    PG.ByteSymbolTable(32) = 4
1081 '    PG.ByteSymbolTable(34) = 5
1082 '    PG.ByteSymbolTable(39) = 6
1083 '    PG.ByteSymbolTable(60) = 7
1084 '    PG.ByteSymbolTable(62) = 8
1085 
1086 
1087 
1088 
1089     
1090     '// 13, 10, 9 32 58,enter,changeline,tab,space,
1091     PG.ByteSymbolTable(9) = 1
1092     PG.ByteSymbolTable(10) = 1
1093     PG.ByteSymbolTable(13) = 1
1094     PG.ByteSymbolTable(32) = 1
1095     
1096     PG.ByteSymbolTable(62) = 1   '// >
1097     PG.ByteSymbolTable(58) = 1   '// 冒号
1098     PG.ByteSymbolTable(61) = 1   '// 等号
1099     
1100     
1101     
1102 '    pg.ByteSymbolTable(62) = 8     '// >
1103     
1104     
1105     '// 34/39    双引号/单引号
1106     PG.ByteSymbolTable(34) = 2
1107     PG.ByteSymbolTable(39) = 3
1108     
1109     
1110     
1111 '    '// /  闭合标签
1112     PG.ByteSymbolTable(47) = 4
1113     
1114     '// !  注释标签
1115     PG.ByteSymbolTable(33) = 5
1116     
1117     
1118     
1119     ' 字母/数字/-/_标记为5
1120     For iCount = 65 To 90: PG.ByteSymbolTable(iCount) = 6: Next         ' A-Z
1121     For iCount = 97 To 122: PG.ByteSymbolTable(iCount) = 6: Next        ' a-z
1122     PG.ByteSymbolTable(45) = 6                                     ' -
1123     PG.ByteSymbolTable(95) = 6                                     ' 下划线 _
1124 
1125     ' 数字
1126     For iCount = 48 To 57: PG.ByteSymbolTable(iCount) = 7: Next    ' 0-9
1127     
1128 End Sub
1129 
1130 Private Function webpage_analyze_skipcdata(webpageTag() As Byte, ByVal tagfrom As Long, ByVal htmlByteMax As Long) As Long
1131     
1132     '// <![CDATA[ ... ]]>
1133     
1134     Dim iCount          As Long
1135     
1136     Const STATE_CDATA_TANHAO_00             As Byte = 10     '// [
1137     
1138     Const STATE_CDATA_LEFTKUOHAO_01         As Byte = 1     '// [
1139     Const STATE_CDATA_C_02                  As Byte = 2     '// C
1140     Const STATE_CDATA_D_03                  As Byte = 3     '// D
1141     Const STATE_CDATA_A_04                  As Byte = 4     '// A
1142     Const STATE_CDATA_T_05                  As Byte = 5
1143     Const STATE_CDATA_A_06                  As Byte = 6
1144     Const STATE_CDATA_LEFTKUOHAO_07         As Byte = 7     '// [
1145     
1146     Const STATE_CDATA_RIGHTKUOHAO_08        As Byte = 8
1147     Const STATE_CDATA_RIGHTKUOHAO_09        As Byte = 9
1148     
1149     Dim stateValue      As Long
1150     
1151     For iCount = tagfrom + 2 To htmlByteMax Step 2
1152         
1153         If webpageTag(iCount + 1) = 0 Then
1154             
1155             Select Case webpageTag(iCount)
1156             
1157                    Case 91
1158                         If stateValue = STATE_CDATA_A_06 Then
1159                             stateValue = STATE_CDATA_LEFTKUOHAO_07
1160                             
1161                         ElseIf stateValue = STATE_CDATA_TANHAO_00 Then
1162                             stateValue = STATE_CDATA_LEFTKUOHAO_01
1163                         
1164                         ElseIf stateValue = STATE_CDATA_LEFTKUOHAO_07 Then
1165                             'stateValue = STATE_CDATA_LEFTKUOHAO_01
1166                             
1167                         Else
1168                             Exit Function
1169                         End If
1170                         
1171                    
1172                    Case 67, 99: If stateValue = STATE_CDATA_LEFTKUOHAO_01 Then stateValue = STATE_CDATA_C_02
1173                    Case 68, 100: If stateValue = STATE_CDATA_C_02 Then stateValue = STATE_CDATA_D_03 Else stateValue = STATE_CDATA_LEFTKUOHAO_07
1174                    Case 65, 97
1175                         If stateValue = STATE_CDATA_D_03 Then
1176                             stateValue = STATE_CDATA_A_04
1177                         ElseIf stateValue = STATE_CDATA_T_05 Then
1178                             stateValue = STATE_CDATA_A_06
1179                         Else
1180                             stateValue = STATE_CDATA_LEFTKUOHAO_07
1181                         End If
1182                         
1183                    Case 84, 116: If stateValue = STATE_CDATA_A_04 Then stateValue = STATE_CDATA_T_05 Else stateValue = STATE_CDATA_LEFTKUOHAO_07
1184                    Case 93
1185                         If stateValue = STATE_CDATA_LEFTKUOHAO_07 Then
1186 
1187                             stateValue = STATE_CDATA_RIGHTKUOHAO_08
1188 
1189                         ElseIf stateValue = STATE_CDATA_RIGHTKUOHAO_08 Then
1190 
1191                             stateValue = STATE_CDATA_RIGHTKUOHAO_09
1192                             
1193                         ElseIf stateValue = STATE_CDATA_RIGHTKUOHAO_09 Then
1194                             
1195                             stateValue = STATE_CDATA_RIGHTKUOHAO_09
1196                             
1197                         End If
1198                         
1199                    Case 62
1200                         If stateValue = STATE_CDATA_RIGHTKUOHAO_09 Then webpage_analyze_skipcdata = iCount + 1: Debug.Print Timer, Chr(webpageTag(iCount - 4)), Chr(webpageTag(iCount - 2)), Chr(webpageTag(iCount))
1201                         
1202                    Case 33
1203                         If stateValue = 0 Then stateValue = STATE_CDATA_TANHAO_00
1204                         
1205                    Case 32, 9, 13, 10
1206                    
1207                    Case Else
1208                    
1209                         If (stateValue = STATE_CDATA_RIGHTKUOHAO_08) Or (stateValue = STATE_CDATA_RIGHTKUOHAO_09) Then
1210                             stateValue = STATE_CDATA_LEFTKUOHAO_07
1211                         
1212                         ElseIf stateValue <> STATE_CDATA_LEFTKUOHAO_07 Then
1213                             Exit Function
1214                             
1215                         End If
1216                         
1217             End Select
1218                         
1219         End If
1220         
1221     Next
1222     
1223     
1224     
1225 End Function

 

posted on 2025-09-29 22:58  PctGL  阅读(8)  评论(0)    收藏  举报