当前位置：首页 > news >正文

如何用 Go 语言和正则表达式爬取数据

news 来源：原创 2025/4/26 19:14:34

文章精选推荐

1 JetBrains Ai assistant 编程工具让你的工作效率翻倍
2 Extra Icons：JetBrains IDE的图标增强神器
3 IDEA插件推荐-SequenceDiagram，自动生成时序图
4 BashSupport Pro 这个ides插件主要是用来干嘛的？
5 IDEA必装的插件：Spring Boot Helper的使用与功能特点
6 Ai assistant ,又是一个写代码神器
7 Cursor 设备ID修改器，你的Cursor又可以继续试用了

文章正文

Go语言结合正则表达式可以构建高效的数据爬取工具。下面我将提供几个完整的实例，涵盖不同场景下的数据爬取需求。

1. 基础网页内容爬取

1.1 获取网页中所有链接

package main

import (
	"fmt"
	"io"
	"net/http"
	"regexp"
	"time"
)

// linkRe captures the href attribute value of every <a> tag. It is compiled
// once at package scope so the pattern is never rebuilt per call.
var linkRe = regexp.MustCompile(`<a[^>]+href=["'](.*?)["']`)

// extractLinks returns the href value of every <a> tag found in html, in
// document order. It returns an empty slice when there are no links.
func extractLinks(html string) []string {
	matches := linkRe.FindAllStringSubmatch(html, -1)
	links := make([]string, 0, len(matches))
	for _, m := range matches {
		// m[0] is the whole match, m[1] the captured href value.
		links = append(links, m[1])
	}
	return links
}

// main downloads https://example.com and prints every link found on the page.
func main() {
	// A timeout keeps the crawler from hanging forever on a stalled server.
	client := &http.Client{Timeout: 10 * time.Second}

	resp, err := client.Get("https://example.com")
	if err != nil {
		fmt.Println("HTTP请求失败:", err)
		return
	}
	defer resp.Body.Close()

	// A non-200 response is usually an error page, not the content we want.
	if resp.StatusCode != http.StatusOK {
		fmt.Println("HTTP请求失败:", resp.Status)
		return
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("读取响应失败:", err)
		return
	}

	fmt.Println("找到的链接:")
	for _, link := range extractLinks(string(body)) {
		fmt.Println(link)
	}
}

1.2 提取特定模式的文本

package main

import (
	"fmt"
	"io"
	"net/http"
	"regexp"
	"strings"
	"time"
)

var (
	// headingRe captures the inner HTML of <h1>..<h6> elements. The (?s) flag
	// lets "." match newlines so headings formatted across lines are found.
	// RE2 has no backreferences, so the closing tag level is not required to
	// equal the opening one.
	headingRe = regexp.MustCompile(`(?s)<h[1-6][^>]*>(.*?)</h[1-6]>`)

	// tagRe matches any single HTML tag; used to strip nested markup.
	tagRe = regexp.MustCompile(`<[^>]+>`)
)

// extractHeadings returns the text of every <h1>-<h6> element in html, in
// document order, with nested tags removed and surrounding whitespace trimmed.
func extractHeadings(html string) []string {
	matches := headingRe.FindAllStringSubmatch(html, -1)
	headings := make([]string, 0, len(matches))
	for _, m := range matches {
		text := tagRe.ReplaceAllString(m[1], "")
		headings = append(headings, strings.TrimSpace(text))
	}
	return headings
}

// main downloads https://example.com and prints the text of all headings.
func main() {
	client := &http.Client{Timeout: 10 * time.Second}

	resp, err := client.Get("https://example.com")
	if err != nil {
		fmt.Println("HTTP请求失败:", err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Println("HTTP请求失败:", resp.Status)
		return
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("读取响应失败:", err)
		return
	}

	fmt.Println("网页标题:")
	for _, heading := range extractHeadings(string(body)) {
		fmt.Println(heading)
	}
}

2. 结构化数据爬取

2.1 爬取表格数据

package main

import (
	"fmt"
	"io"
	"net/http"
	"regexp"
	"strings"
	"time"
)

// All patterns use (?s) so that "." also matches newlines: real-world tables
// are almost always formatted across several lines, and without the flag the
// patterns would never match them.
var (
	tableRe = regexp.MustCompile(`(?s)<table[^>]*>(.*?)</table>`)
	rowRe   = regexp.MustCompile(`(?s)<tr[^>]*>(.*?)</tr>`)
	cellRe  = regexp.MustCompile(`(?s)<t[dh][^>]*>(.*?)</t[dh]>`)
	tagRe   = regexp.MustCompile(`<[^>]+>`)
)

// parseTable extracts the first <table> in html as rows of cell text.
// Nested tags inside a cell are stripped and the text is trimmed. The second
// result is false when html contains no table. Nested tables are not
// supported because the match stops at the first closing </table>.
func parseTable(html string) ([][]string, bool) {
	tableMatch := tableRe.FindStringSubmatch(html)
	if tableMatch == nil {
		return nil, false
	}

	rowMatches := rowRe.FindAllStringSubmatch(tableMatch[1], -1)
	rows := make([][]string, 0, len(rowMatches))
	for _, rowMatch := range rowMatches {
		cellMatches := cellRe.FindAllStringSubmatch(rowMatch[1], -1)
		cells := make([]string, 0, len(cellMatches))
		for _, cellMatch := range cellMatches {
			text := tagRe.ReplaceAllString(cellMatch[1], "")
			cells = append(cells, strings.TrimSpace(text))
		}
		rows = append(rows, cells)
	}
	return rows, true
}

// main downloads a page and prints the first table as tab-separated rows.
func main() {
	client := &http.Client{Timeout: 10 * time.Second}

	resp, err := client.Get("https://example.com/table-page")
	if err != nil {
		fmt.Println("HTTP请求失败:", err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Println("HTTP请求失败:", resp.Status)
		return
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("读取响应失败:", err)
		return
	}

	rows, ok := parseTable(string(body))
	if !ok {
		fmt.Println("未找到表格")
		return
	}

	fmt.Println("表格数据:")
	for _, row := range rows {
		for _, cell := range row {
			fmt.Printf("%s\t", cell)
		}
		fmt.Println() // end of row
	}
}

2.2 爬取JSON数据中的特定字段

package main

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"regexp"
	"time"
)

// Product is one entry of the product list returned by the API.
type Product struct {
	Name  string  `json:"name"`
	Price float64 `json:"price"`
}

// productRe captures a "name" string and the following non-negative "price"
// number inside the same JSON object ([^}]+ cannot cross a closing brace).
var productRe = regexp.MustCompile(`"name"\s*:\s*"([^"]+)"[^}]+"price"\s*:\s*(\d+\.?\d*)`)

// extractProductFields scans body for name/price pairs without parsing the
// JSON structure. Each element is {name, price}, with the price kept as the
// literal text that appeared in the document.
func extractProductFields(body string) [][2]string {
	matches := productRe.FindAllStringSubmatch(body, -1)
	fields := make([][2]string, 0, len(matches))
	for _, m := range matches {
		fields = append(fields, [2]string{m[1], m[2]})
	}
	return fields
}

// main fetches the product list and prints it. It first tries a strict JSON
// decode and falls back to regex extraction when the payload does not have
// the expected top-level array shape.
func main() {
	client := &http.Client{Timeout: 10 * time.Second}

	resp, err := client.Get("https://api.example.com/products")
	if err != nil {
		fmt.Println("HTTP请求失败:", err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Println("HTTP请求失败:", resp.Status)
		return
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("读取响应失败:", err)
		return
	}

	// Method 1: decode the JSON directly.
	var products []Product
	if err := json.Unmarshal(body, &products); err == nil {
		fmt.Println("产品列表(JSON解析):")
		for _, p := range products {
			fmt.Printf("%s - $%.2f\n", p.Name, p.Price)
		}
		return
	}

	// Method 2: the structure is not what we expected, so fall back to regex.
	fmt.Println("\n尝试使用正则表达式提取:")
	for _, f := range extractProductFields(string(body)) {
		fmt.Printf("%s - $%s\n", f[0], f[1])
	}
}

3. 高级爬虫技巧

3.1 带并发控制的爬虫

package main

import (
	"fmt"
	"io"
	"net/http"
	"regexp"
	"strings"
	"sync"
	"time"
)

// maxConcurrent is the number of requests allowed to be in flight at once.
const maxConcurrent = 3

// titleRe captures the contents of the <title> element; (?s) lets the title
// span multiple lines.
var titleRe = regexp.MustCompile(`(?s)<title[^>]*>(.*?)</title>`)

// extractTitle returns the trimmed text of the first <title> element in html.
// The second result is false when the page has no <title>.
func extractTitle(html string) (string, bool) {
	m := titleRe.FindStringSubmatch(html)
	if m == nil {
		return "", false
	}
	return strings.TrimSpace(m[1]), true
}

// fetchBody downloads u with client and returns the response body as a
// string. A non-200 status is reported as an error.
func fetchBody(client *http.Client, u string) (string, error) {
	resp, err := client.Get(u)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("意外的HTTP状态: %s", resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}

// main fetches several pages concurrently, with at most maxConcurrent
// requests in flight, and prints each page's title.
func main() {
	urls := []string{
		"https://example.com/page1",
		"https://example.com/page2",
		"https://example.com/page3",
	}

	// One shared client so connections are reused and every request has a timeout.
	client := &http.Client{Timeout: 10 * time.Second}

	var wg sync.WaitGroup
	semaphore := make(chan struct{}, maxConcurrent)

	for _, pageURL := range urls {
		wg.Add(1)
		go func(u string) {
			defer wg.Done()

			semaphore <- struct{}{} // acquire a slot
			// Deferring the release guarantees the slot is returned on every
			// exit path, including early returns on error.
			defer func() { <-semaphore }()

			html, err := fetchBody(client, u)
			if err != nil {
				fmt.Printf("获取 %s 失败: %v\n", u, err)
				return
			}
			if title, ok := extractTitle(html); ok {
				fmt.Printf("%s 的标题: %s\n", u, title)
			}
		}(pageURL)
	}

	wg.Wait()
}

3.2 处理分页内容

package main

import (
	"fmt"
	"io"
	"net/http"
	"regexp"
	"strconv"
	"strings"
	"time"
)

var (
	// newsItemRe captures the inner HTML of each news item. (?s) allows the
	// item to span lines. The match ends at the first </div>, so items that
	// contain nested <div> elements are truncated.
	newsItemRe = regexp.MustCompile(`(?s)<div class="news-item">(.*?)</div>`)

	// newsTitleRe captures the headline inside one news item.
	newsTitleRe = regexp.MustCompile(`(?s)<h2>(.*?)</h2>`)

	// totalPagesRe matches pager text such as "共 5 页".
	totalPagesRe = regexp.MustCompile(`共\s*(\d+)\s*页`)

	// httpClient is shared by all requests so connections are reused and
	// every request is bounded by a timeout.
	httpClient = &http.Client{Timeout: 10 * time.Second}
)

// fetchPage downloads url and returns the body as a string. A non-200 status
// is reported as an error.
func fetchPage(url string) (string, error) {
	resp, err := httpClient.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("意外的HTTP状态: %s", resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}

// extractNewsTitles returns the trimmed headline of every news item in html.
// Items without an <h2> are skipped.
func extractNewsTitles(html string) []string {
	var titles []string
	for _, item := range newsItemRe.FindAllStringSubmatch(html, -1) {
		if m := newsTitleRe.FindStringSubmatch(item[1]); m != nil {
			titles = append(titles, strings.TrimSpace(m[1]))
		}
	}
	return titles
}

// parseTotalPages reads the page count from pager text like "共 5 页".
// It returns 1 when the text is missing, the number does not fit in an int,
// or the count is less than 1, so the caller always crawls at least one page.
func parseTotalPages(html string) int {
	m := totalPagesRe.FindStringSubmatch(html)
	if m == nil {
		return 1
	}
	total, err := strconv.Atoi(m[1])
	if err != nil || total < 1 {
		return 1
	}
	return total
}

// main discovers how many pages the news list has and prints the headlines
// found on each page.
func main() {
	baseURL := "https://example.com/news?page="

	// Determine the total page count from the first page.
	totalPages := getTotalPages(baseURL + "1")
	fmt.Printf("共发现 %d 页内容\n", totalPages)

	for page := 1; page <= totalPages; page++ {
		pageURL := baseURL + strconv.Itoa(page)
		fmt.Printf("\n正在爬取第 %d 页: %s\n", page, pageURL)

		html, err := fetchPage(pageURL)
		if err != nil {
			// One bad page should not abort the whole crawl.
			fmt.Printf("获取第 %d 页失败: %v\n", page, err)
			continue
		}
		for _, title := range extractNewsTitles(html) {
			fmt.Println("新闻标题:", title)
		}
	}
}

// getTotalPages downloads url and returns the page count announced on it.
// It returns 1 when the page cannot be fetched or carries no usable count.
func getTotalPages(url string) int {
	html, err := fetchPage(url)
	if err != nil {
		return 1 // default to a single page
	}
	return parseTotalPages(html)
}

实用技巧与注意事项

User-Agent设置：

// Requires imports: "fmt", "net/http", "time".

// A timeout keeps a stalled server from blocking the crawler forever.
client := &http.Client{Timeout: 10 * time.Second}

req, err := http.NewRequest(http.MethodGet, "https://example.com", nil)
if err != nil {
	return fmt.Errorf("创建请求失败: %w", err)
}

// Identify the crawler; many sites reject Go's default User-Agent.
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; MyBot/1.0)")

resp, err := client.Do(req)
if err != nil {
	return fmt.Errorf("请求失败: %w", err)
}
// Always close the body so the underlying connection can be reused.
defer resp.Body.Close()

处理相对链接：

import (
	"fmt"
	"net/url"
)

// resolveURL turns ref, a link as it appears in a page (relative or
// absolute), into an absolute URL using baseURL as the page's address.
// It returns an error when either argument is not a parseable URL.
//
// Example:
//
//	absURL, err := resolveURL("https://example.com", "/page1")
//	// absURL == "https://example.com/page1"
func resolveURL(baseURL, ref string) (string, error) {
	base, err := url.Parse(baseURL)
	if err != nil {
		return "", fmt.Errorf("解析基础URL失败: %w", err)
	}
	rel, err := url.Parse(ref)
	if err != nil {
		return "", fmt.Errorf("解析相对链接失败: %w", err)
	}
	return base.ResolveReference(rel).String(), nil
}

正则表达式优化：

预编译正则表达式：re := regexp.MustCompile(pattern)，并放在循环之外（最好是包级变量），避免重复编译
使用非贪婪匹配：.*?（注意：Go 中 . 默认不匹配换行符，需要跨行匹配时请在表达式开头加上 (?s) 标志）
避免过度复杂的正则表达式

错误处理增强：

// Issue the request and wrap any failure so the caller sees the cause chain.
resp, err := http.Get(url)
if err != nil {
	return fmt.Errorf("请求失败: %w", err)
}

// Closing the body can fail too; log it instead of silently dropping it.
defer func() {
	closeErr := resp.Body.Close()
	if closeErr == nil {
		return
	}
	log.Printf("关闭响应体失败: %v", closeErr)
}()

反爬虫策略应对

设置合理的请求间隔：

import "time"func crawlWithDelay(urls []string, delay time.Duration) {for _, url := range urls {go crawlPage(url)time.Sleep(delay)}
}

使用代理IP：

proxyUrl, _ := url.Parse("http://proxy-ip:port")
client := &http.Client{Transport: &http.Transport{Proxy: http.ProxyURL(proxyUrl),},
}
resp, _ := client.Get("https://example.com")

处理Cookies：

// Requires imports: "fmt", "net/http", "net/http/cookiejar".

jar, err := cookiejar.New(nil)
if err != nil {
	return fmt.Errorf("创建CookieJar失败: %w", err)
}
client := &http.Client{Jar: jar}

// The first request lets the server set its cookies, which the jar stores.
loginResp, err := client.Get("https://example.com/login")
if err != nil {
	return fmt.Errorf("请求失败: %w", err)
}
// Close the body right away so the connection is not leaked.
loginResp.Body.Close()

// Later requests through the same client send the stored cookies.
pageResp, err := client.Get("https://example.com/protected-page")
if err != nil {
	return fmt.Errorf("请求失败: %w", err)
}
defer pageResp.Body.Close()