golang使用正则表达式解析网页
废话少说,直接奉上代码:
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
	"strings"
	"time"
)
funcmain(){
ip_pool:=[]string{
"172.16.1.128",
"172.16.1.129",
"172.16.1.131",
"172.16.1.132",
"172.16.1.133",
"172.16.1.134",
"172.16.1.135",
"172.16.1.136",
"172.16.1.137",
"172.16.1.138",
"172.16.1.190",
}
for{
fori:=0;i<len(ip_pool);i++ {
url:="http://"+ip_pool[i]+":10022"
//fmt.Println("-----------------",ip_pool[i],"---------")
get_url(url)
time.Sleep(1*time.Millisecond)
}
//time.Sleep(time.Second*60)
}
}
// getURLClient is the HTTP client used by get_url. It carries a timeout so a
// single unresponsive host cannot block the caller indefinitely.
var getURLClient = &http.Client{Timeout: 10 * time.Second}

// Regular expressions used by tableToCSV. They are compiled once at package
// scope instead of on every call; MustCompile surfaces a bad pattern at
// start-up rather than silently yielding a nil *regexp.Regexp.
var (
	// reGetURLTag matches one HTML tag, from '<' to the nearest '>'.
	reGetURLTag = regexp.MustCompile("\\<[\\S\\s]+?\\>")
	// reGetURLHead matches everything from "<!doc" up to the first "<table".
	reGetURLHead = regexp.MustCompile("\\<!doc[\\S\\s]+?\\<table")
	// reGetURLTail matches everything from "</table>" up to "</html>".
	reGetURLTail = regexp.MustCompile("</table\\>[\\S\\s]+?\\</html\\>")
	// reGetURLRow matches one "<tr>...</tr>" element; group 2 is its content.
	reGetURLRow = regexp.MustCompile("(\\<tr>)([\\S\\s\\<>\"\\d]+?)(\\</tr>)")
	// reGetURLRecord splits a flattened row (all cell texts concatenated with
	// no separators) into six groups: leading word characters, six
	// colon-separated two-character pairs, letters, a date immediately
	// followed by a time, a run of 'V'/digits/dots, and a date with an
	// optional time. Judging by the header names substituted in tableToCSV
	// these are VM name, MAC, state, heartbeat time, engine version and
	// virus-database date.
	reGetURLRecord = regexp.MustCompile("(\\w*)(\\w{2}:\\w{2}:\\w{2}:\\w{2}:\\w{2}:\\w{2})([A-Za-z]*)(\\d{4}-\\d{2}-\\d{2}\\d{2}:\\d{2}:\\d{2})([V\\d\\.]*)(\\d{4}-\\d{2}-\\d{2}(\\d{2}:\\d{2}(:\\d{2})?)?)")
)

// tableToCSV converts the HTML status page into comma-separated text with one
// table row per line.
func tableToCSV(src string) string {
	// Lower-case every tag so the literal tag patterns below match
	// regardless of the page's original casing.
	src = reGetURLTag.ReplaceAllStringFunc(src, strings.ToLower)

	// Keep only the table: drop what precedes "<table" and what follows
	// "</table>".
	src = reGetURLHead.ReplaceAllString(src, "<table")
	src = reGetURLTail.ReplaceAllString(src, "</table>")

	// Replace each <tr>...</tr> by its content followed by "]", a temporary
	// end-of-row marker that survives the newline removal below.
	src = reGetURLRow.ReplaceAllString(src, "$2]")

	// Strip all remaining tags, then all newlines and spaces, so that each
	// row becomes one unbroken run of characters.
	src = reGetURLTag.ReplaceAllString(src, "")
	src = strings.Replace(src, "\n", "", -1)
	// The pattern here was the invalid "[]+", which left a nil regexp and
	// panicked on use. A space-only class is assumed to be what was intended:
	// reGetURLRecord expects the date and time to be directly adjacent.
	src = strings.Replace(src, " ", "", -1)

	// Turn the row markers into real line breaks.
	src = strings.Replace(src, "]", "\n", -1)

	// Insert commas between the fields of every data row.
	src = reGetURLRecord.ReplaceAllString(src, "$1,$2,$3,$4,$5,$6,")

	// Swap the concatenated Chinese header row for comma-separated ASCII
	// column names.
	return strings.Replace(src, "虚拟机名称虚拟机MAC虚拟机状态心跳时间引擎版本病毒库日期扫描样本数", "vm_name,vm_mac,vm_state,vm_heart,vm_eg,vm_av_db,vm_count", -1)
}

// get_url fetches url over HTTP, converts the HTML table in the response to
// comma-separated lines (see tableToCSV) and prints the result to standard
// output, preceded by a banner line containing the URL.
//
// A failed request or a failed body read is reported on standard output and
// the function returns without printing any table data.
func get_url(url string) {
	fmt.Println("----------", url, "----------------")

	resp, err := getURLClient.Get(url)
	if err != nil {
		// resp is nil on error, so it must not be touched past this point.
		fmt.Println("http get error:", err)
		return
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("http read error:", err)
		return
	}

	fmt.Println(tableToCSV(string(body)))
}
以上就是本文给大家分享的代码了,希望大家能够喜欢。