'提取试卷优化
Private Declare Function URLDownloadToFile Lib "urlmon" Alias "URLDownloadToFileA" (ByVal pCaller As Long, ByVal szURL As String, ByVal szFileName As String, ByVal dwReserved As Long, ByVal lpfnCB As Long) As Long
Private Declare Function DeleteUrlCacheEntry Lib "wininet" Alias "DeleteUrlCacheEntryA" (ByVal lpszUrlName As String) As Long
Sub DownloadImageName(ByVal ImageURL As String, ByVal ImagePath As String)
Dim lngRetVal As Long
lngRetVal = URLDownloadToFile(0, ImageURL, ImagePath, 0, 0)
If lngRetVal = 0 Then
DeleteUrlCacheEntry ImageURL '清除缓存
'MsgBox "成功"
Else
'MsgBox "失败"
End If
End Sub
Sub LoopDownloadExam()
Dim Wb As Workbook
Dim Sht As Worksheet
Set Wb = Application.ThisWorkbook
Set Sht = Wb.ActiveSheet
With Sht
EndRow = .Cells(.Cells.Rows.Count, 2).End(xlUp).Row
For i = 2 To EndRow
If .Cells(i, 2).Text Like "http*" Then
NewGetEaxmContent .Cells(i, 2).Text
End If
Next i
End With
Set Wb = Nothing
Set Sht = Nothing
End Sub
Sub DownloadExam()
Dim Rng As Range
Set Rng = Application.ActiveCell
If Rng.Text Like "http*" Then
NewGetEaxmContent Rng.Text
End If
Set Rng = Nothing
End Sub
Sub NewGetEaxmContent(ByVal Url As String)
Dim ContentCode As String
Dim dPos As Object
Set dPos = CreateObject("Scripting.Dictionary")
'send request
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", Url, False
.Send
WebText = .responsetext
'Debug.Print WebText
' Stop
End With
With CreateObject("htmlfile")
.write WebText
Set examdiv = .getElementById("sina_keyword_ad_area2")
Title = Replace(.getElementsByTagName("title")(0).innerText, "新浪博客", "")
docPath = ThisWorkbook.Path & "\" & Title & ".doc"
If Dir(docPath) <> "" Then
MsgBox "该份试卷已经存在!"
GoTo ErrorExit
End If
'Debug.Print Title
ContentCode = Split(WebText, "sina_keyword_ad_area2")(1)
ContentCode = Split(ContentCode, "正文结束")(0)
ContentCode = Replace(ContentCode, Title, "")
ContentCode = Replace(ContentCode, "宋体", "")
ContentCode = Replace(ContentCode, "楷体", "")
'Debug.Print ContentCode
'http://s15.sinaimg.cn/mw690/001Eip7Fzy7iGylGjfg3e&690
'http://s15.sinaimg.cn/mw690/001Eip7Fzy7iGylGjfg3e&690
Open ThisWorkbook.Path & "\html.txt" For Output As #1 '生成CSV文件
Print #1, ContentCode '写入CSV的内容
Close #1 '关闭文件句柄
'获取试卷文本内容
ExamText = examdiv.innerText
'For Each oneP In examdiv.getElementsByTagName("p")
'Debug.Print oneP.innerText
'Next oneP
imgIndex = 0
For Each oneimg In examdiv.getElementsByTagName("img")
imgIndex = imgIndex + 1
imgUrl = oneimg.real_src
imgPath = ThisWorkbook.Path & "\" & imgIndex & ".jpg"
DownloadImageName imgUrl, imgPath
sp = Split(imgUrl, "&")(0)
Debug.Print sp
Debug.Print InStr(ContentCode, sp)
cnt = Split(ContentCode, sp)(1)
spos = RegGet(cnt, "([\u4e00-\u9fa5]{5,})")
dPos(spos) = imgPath
Debug.Print spos
Next oneimg
'输出题目内容到Word文档
Dim wdApp As Object
Dim Doc As Object
On Error Resume Next
Set wdApp = GetObject(, "Word.Application")
On Error GoTo 0
If Not wdApp Is Nothing Then
wdApp.Visible = True
On Error Resume Next
Set Doc = wdApp.Documents(docName)
On Error GoTo 0
If Doc Is Nothing Then
Set Doc = wdApp.Documents.Add()
Doc.SaveAs docPath
End If
Else
Set wdApp = CreateObject("Word.Application")
wdApp.Visible = True
If Dir(docPath) <> "" Then
Set Doc = wdApp.Documents.Open(docPath)
Else
Set Doc = wdApp.Documents.Add()
Doc.SaveAs docPath
End If
End If
Doc.Activate
wdApp.Selection.homekey 6
For Each oneP In examdiv.getElementsByTagName("p")
pText = oneP.innerText
For Each oneimg In dPos.keys
If InStr(pText, oneimg) > 0 Then
ImagePath = dPos(oneimg)
'插入图片
wdApp.Selection.InlineShapes.AddPicture Filename:=ImagePath, LinkToFile:=False, SaveWithDocument:=True
wdApp.Selection.TypeParagraph
On Error Resume Next
Kill ImagePath
On Error GoTo 0
Exit For
End If
Next oneimg
wdApp.Selection.Typetext pText
wdApp.Selection.TypeParagraph
'Debug.Print oneP.innerText
Next oneP
Doc.Save
Doc.Close True
wdApp.Quit
End With
ErrorExit:
Set dPos = Nothing
Set wdApp = Nothing
Set Doc = Nothing
End Sub
Private Function RegTest(ByVal OrgText As String, ByVal Pattern As String) As Boolean
'传递参数 :原字符串, 匹配模式
Dim Regex As Object
Set Regex = CreateObject("VBScript.RegExp")
With Regex
.Global = True
.Pattern = Pattern
End With
RegTest = Regex.test(OrgText)
Set Regex = Nothing
End Function
Private Function RegGet(ByVal OrgText As String, ByVal Pattern As String) As String
'传递参数 :原字符串, 匹配模式
Dim Regex As Object
Dim Mh As Object
Set Regex = CreateObject("VBScript.RegExp")
With Regex
.Global = True
.Pattern = Pattern
End With
If Regex.test(OrgText) Then
Set Mh = Regex.Execute(OrgText)
RegGet = Mh.Item(0).submatches(0)
Else
RegGet = ""
End If
Set Regex = Nothing
End Function
Private Function RegReplace(ByVal OrgText As String, ByVal Pattern As String, Optional RepStr As String = "") As String
'传递参数 :原字符串, 匹配模式 ,替换字符
Dim Regex As Object
Dim newText As String
Set Regex = CreateObject("VBScript.RegExp")
With Regex
.Global = True
.Pattern = Pattern
End With
newText = Regex.Replace(OrgText, RepStr)
RegReplace = newText
Set Regex = Nothing
End Function
Private Function RegGetArray(ByVal OrgText As String, ByVal Pattern As String) As String()
Dim Reg As Object, Mh As Object, OneMh As Object
Dim Arr() As String, Index As Long
Dim Elm As String
Set Reg = CreateObject("Vbscript.Regexp")
With Reg
.MultiLine = True
.Global = True
.Ignorecase = False
.Pattern = Pattern
Set Mh = .Execute(OrgText)
Index = 0
ReDim Arr(1 To 1)
For Each OneMh In Mh
Index = Index + 1
ReDim Preserve Arr(1 To Index)
'If OneMh.submatches(0) <> "" Then Elm = OneMh.submatches(0)
Arr(Index) = OneMh.submatches(0)
'Debug.Print OneMh.submatches(0)
Next OneMh
End With
RegGetArray = Arr
Set Reg = Nothing
Set Mh = Nothing
End Function
Private Function RegGetLast(ByVal OrgText As String, ByVal Pattern As String) As String
'传递参数 :原字符串, 匹配模式
Dim Regex As Object
Dim Mh As Object
Set Regex = CreateObject("VBScript.RegExp")
With Regex
.Global = True
.Pattern = Pattern
End With
If Regex.test(OrgText) Then
Set Mh = Regex.Execute(OrgText)
'RegGetLast = Mh.Item(0).submatches(0)
For Each OneMh In Mh
RegGetLast = OneMh.submatches(0)
Next OneMh
Else
RegGetLast = ""
End If
Set Regex = Nothing
End Function