现在网上的采集程序很多,但是有时候你发现一个好的网站,想自己做个采集工具采集一些信息,就需要自己去写程序了,其实这样的采集程序并不难写,主要是去分析源网站的网页结构。首先去下载个XMLHTTP的类文件:<%Class xhttpprivate cset,sUrl,sErrorPrivate Sub Class_Initialize()'cset="UTF-8"cset="GB2312"sError=""end sub
Private Sub Class_Terminate()End Sub
Public Property LET URL(theurl)sUrl=theurlend propertypublic property GET BasePath()BasePath=mid(sUrl,1,InStrRev(sUrl,"/")-1)end propertypublic property GET FileName()FileName=mid(sUrl,InStrRev(sUrl,"/")+1)end propertypublic property GET Html()Html=BytesToBstr(getBody(sUrl))end property
public property GET xhttpError()xhttpError=sErrorend property
private Function BytesToBstr(body)on error resume next'Cset:GB2312 UTF-8dim objstream set objstream = Server.CreateObject("adodb.stream") with objstream.Type = 1 '.Mode = 3 '.Open .Write body '.Position = 0 '.Type = 2 '.Charset = Cset 'BytesToBstr = .ReadText '.Close end withset objstream = nothing End Function
private function getBody(surl)on error resume nextdim xmlHttp'Set xmlHttp=server.createobject("Msxml2.XMLHTTP.4.0")'set xmlHttp=server.createobject("Microsoft.XMLHTTP")set xmlHttp=server.createobject("MSXML2.ServerXMLHTTP")xmlHttp.setTimeouts 10000,10000,10000,30000xmlHttp.open "GET",surl,falsexmlHttp.sendif xmlHttp.readystate=4 then 'if xmlHttp.status=200 then getBody=xmlhttp.responsebody'end if else getBody=""end if
if Err.Number<>0 then sError=Err.NumberErr.clearelsesError=""end ifset xmlHttp=nothingend function
Public function saveimage(tofile,isoverwrite)on error resume nextdim objStream,objFSO,imgs
if Not isoverwrite Then Set objFSO = Server.CreateObject("Scripting.FileSystemObject") If objFSO.FileExists(Server.MapPath(tofile)) Then Exit Function End If Set objFSO = NothingEnd IF
imgs=getBody(sUrl)Set objStream = Server.CreateObject("ADODB.Stream")with objStream.Type =1.Open.write imgs.SaveToFile server.mappath(tofile),2.Close()end withset objstream=nothingend function
end class
%>用了这个类文件,做起事情来就方便多了。然后就可以分析采集网站的网页结构,写采集程序了。下面给个例子:<!--#include file="conn.asp"--><!--#include file="inc/xhttp_class.asp"--><!--#include file="inc/function.asp"--><%server.ScriptTimeout = 1000%><html><head><meta http-equiv="Content-Type" content="text/html; charset=gb2312" /><title>BT采集器</title></head><body><form name="form1" method="post" action="get81bt.asp">分类ID: <input type="text" name="cid" value="<%=request("cid")%>"><br>开始ID: <input type="text" name="startid" value="<%=request("startid")%>"> <br> 结束ID: <input type="text" name="overid" value="<%=request("overid")%>"> <br> 分类名称:<input type="text" name="classname" value="<%=request("classname")%>">为空自动获取 <br> <input name="action" type="hidden" id="action" value="getdata"> <input type="submit" name="Submit" value="采集"></form>当前ID:<%=request("id")%> <br><%dim actionaction = Request("action")if action = "getdata" then cid = Request("cid") startid = Request("startid") overid = Request("overid") id = Request("id") if id = "" then id = startid set objxhttp = new xhttp objxhttp.URL = "http://www.81dd.com/Class/"&cid&"_"&id&".htm" content = objxhttp.Html if InStr(content,"网站维护中") then call NextID response.End() end if list = GetContent(content,"<!--内容开始-->","<!--内容结束-->",0) Dim regEx, Match, Matches,patrn Set regEx = New RegExp patrn = "<a href=""../BtHtml/(.+?)"">" regEx.Pattern = patrn regEx.IgnoreCase = True regEx.Global = True Set Matches = regEx.Execute(list) on error resume next For Each Match in Matches 'response.write Match.Value & "<br>" weburl = "http://www.81dd.com/BtHtml/" & regEx.Replace(Match.Value,"$1") response.write weburl & "<br>" response.Flush() objxhttp.URL = weburl cpage = objxhttp.Html cpage = GetContent(cpage,"<!--内容开始-->","<!--内容结束-->",0) title = GetContent(cpage,"BT资源名称:<strong>","</strong>",0) title = stripHTML(title) IF Request("classname") <> "" then classname = Request("classname") Else if InStr(title,"喜剧") then classname = "喜剧" Elseif InStr(title,"动作") then classname = "动作" Elseif InStr(title,"惊悚") then classname = "惊悚" Elseif InStr(title,"犯罪") then classname = "犯罪" Elseif InStr(title,"恐怖") then classname = "恐怖" Elseif InStr(title,"爱情") then classname = "爱情" Elseif InStr(title,"冒险") then classname = "冒险" Elseif InStr(title,"科幻") then classname = "科幻" Elseif InStr(title,"悬念") then classname = "悬念" Elseif InStr(title,"奇幻") then classname = "奇幻" Elseif InStr(title,"战争") then classname = "战争" Elseif InStr(title,"连续剧") then classname = "连续剧" Elseif InStr(title,"综艺") then classname = "综艺" Elseif InStr(title,"灾难") then classname = "灾难" Elseif InStr(title,"伦理") then classname = "伦理" Elseif InStr(title,"动漫") or InStr(title,"动画") then classname = "动漫" Elseif InStr(title,"国语") or InStr(title,"集") then classname = "其他影视" Else classname = "其他" End if End IF intro = GetContent(cpage,"<tr><td width=770 bgcolor=#FFFFFF><div style=""margin:10px;line-height:150%"">","</div>",0) intro = Replace(intro,"<br />","[br]") intro = Replace(intro,"<BR />","[br]") intro = Replace(intro,"<BR>","[br]") intro = Replace(intro,"<br>","[br]") intro = Replace(intro,"<p>","[p]") intro = Replace(intro,"<P>","[p]") intro = Replace(intro,"</p>","[/p]") intro = Replace(intro,"</P>","[p]") intro = Replace(intro,"<img","[img") intro = Replace(intro,"<IMG","[img") intro = stripHTML(intro) intro = Replace(intro,"[br]","<br>") intro = Replace(intro,"[p]","<p>") intro = Replace(intro,"[/p]","</p>") intro = Replace(intro,"[img","<img") intro = Replace(intro,"[img]","<img src=") intro = Replace(intro,"[/img]",">") intro = Replace(intro,"[IMG]","<img src=") intro = Replace(intro,"[/IMG]",">") 'response.write t 'response.End() addtime = Trim(GetContent(cpage,"发布时间:"," ",0)) if Not IsDate(addtime) then addtime = now() username = "bt" filesize = GetContent(content,"BT文件大小:"," ",0) title2 = title downurl = GetContent(cpage,"<a style=""color:red"" href=""","""",0) p = CDate(addtime) Dim sRnd Randomize sRnd = Int(900 * Rnd) + 100 sFileName = year(p) & month(p) & day(p) & hour(now) & minute(now) & second(now) & sRnd & ".torrent" url = "torrent/" & year(p) & "-" & month(p) & "-" & day(p) & "/" & sFileName Call CreateF(url) 'Text Response.Write classname & "<br>" Response.write title & "<br>" 'response.Write intro & "<br>" 'response.Write addtime & "<br>" 'response.Write username & "<br>" 'response.Write filesize & "<br>" response.Write downurl & "<br>" response.Write url & "<br>" response.Flush() 'response.End() 'database if err.number = 0 then if (Not IsNull(title)) and title <> "" and downurl <> "" then set rs = server.CreateObject("adodb.recordset") sql = "select * from bt_class where classname = '" & classname & "'" rs.open sql,conn,1,3 if rs.eof then rs.addnew rs("classname") = classname rs.update end if classid = rs("classid") rs.close set rs = nothing set rs = server.CreateObject("adodb.recordset") sql = "select * from bt_movie where title in ('" & title & "')" rs.open sql,conn,1,3 if rs.eof then response.Write "<div><font color=blue>写入数据库...</font></div>" response.Flush() rs.addnew rs("classid") = classid rs("title") = title rs("title2") = title2 rs("intro") = intro rs("username") = username rs("filesize") = filesize rs("url") = url rs("serverid") = 1 rs("addtime") = addtime rs("ismake") = 0 rs.update objxhttp.URL = downurl objxhttp.saveimage url,False else response.Write "<div><font color=red>已经存在!</font></div>" end if rs.close set rs = nothing 'objxhttp.URL = downurl 'objxhttp.saveimage url,False End IF Else err.clear End IF response.Write "-------------------------------------------<br>" Next set regEx = nothing response.Write "下一页<br>" response.Flush() Call NextID()end ifSub NextID conn.close set conn = nothing if cint(startid) < cint(overid) and cint(id) < cint(overid) then response.Write "<script>location.href='get81bt.asp?action=getdata&classname=" & Request("classname") & "&cid=" & cid & "&startid=" & startid & "&overid=" & overid & "&id="& id + 1 &"'</script>" Elseif cint(startid) > cint(overid) and cint(id) > cint(overid) then response.Write "<script>location.href='get81bt.asp?action=getdata&classname=" & Request("classname") & "&cid=" & cid & "&startid=" & startid & "&overid=" & overid & "&id="& id - 1 &"'</script>" Else Response.Write "采集完成!<br>" response.End() End ifEnd Sub%></body></html>
推荐阅读
百度小偷
<%Function bytes2BSTR(vIn)strReturn = ""For i = 1 To LenB(vIn)ThisCharCode = AscB(MidB(vIn,i,1))If ThisCharCode < &H80 ThenstrReturn = strReturn & Chr(ThisCharCode)ElseNextCharCode = AscB(MidB(vIn,i>>>详细阅读
本文标题:自己做采集程序
地址:http://www.17bianji.com/kaifa2/ASP/33280.html
1/2 1