Hi guys,
This is day 6 out of 100 days of code in Go.
The following code implements recursive fetching of internal links on a given web page.
What I want to do next is put goroutines into action, because the fetching process would benefit from running in parallel. Any suggestions on how to do it?
package main
//todo:
// - every link must be visited only once [done]
// - keep a map of visited links [done]
// - fix links and page concatenation [done]
// - extract domain from request uri to simplify crawling [done]
// - rework how internal links are selected for crawling
// - fix how crawling settings are set like depth and maxLinks
import (
"flag"
"fmt"
"io"
"log"
"net/http"
"time"
"golang.org/x/net/html"
)
// Package-level crawl state.
var (
	// baseURL is the value of the -url flag: the prefix that every crawled
	// path is appended to. It is empty until flag.Parse has run.
	baseURL = flag.String("url", "", "start url")

	// visitedLinks records the links that have already been crawled so that
	// no page is fetched twice. It is nil until main allocates it.
	visitedLinks map[string]bool
)
func main() {
visitedLinks = make(map[string]bool)
flag.Parse()
if *baseURL == "" {
log.Fatal("--url paramters is required")
}
visitedLinks[*baseURL] = false
//set parameters for crawling
crawl("/")
}
// crawl fetches the page at *baseURL+link, records link as visited, and then
// recursively crawls every internal link found on that page.
//
// link is a site-relative path such as "/" or "/about". A link already
// recorded in visitedLinks is skipped without being fetched. A failed request
// terminates the program via log.Fatal.
func crawl(link string) {
	if visitedLinks[link] {
		return
	}
	visitedLinks[link] = true

	fmt.Printf("Crawling %s ..................\n\n", *baseURL+link)
	resp, err := http.Get(*baseURL + link)
	if err != nil {
		log.Fatal(err)
	}
	// Collect the links and release the response before recursing. Deferring
	// the Close would keep one body open per level of recursion for the whole
	// descent.
	hrefs := getLinks(resp.Body)
	resp.Body.Close()

	for _, href := range hrefs {
		// Internal links are site-relative, i.e. they start with a single "/".
		if len(href) == 0 || href[0] != '/' {
			continue
		}
		// A leading "//" is a protocol-relative link to another host.
		if len(href) > 1 && href[1] == '/' {
			continue
		}
		// Skip pages that were already crawled (the current page included) so
		// the one-second pause is only spent after a real request.
		if visitedLinks[href] {
			continue
		}
		crawl(href)
		time.Sleep(time.Second)
	}
}
// getLinks tokenizes the HTML read from body and returns the value of every
// href attribute found on an <a> tag, in document order.
//
// Both ordinary start tags (<a href="...">) and self-closing tags
// (<a href="..."/>) are inspected. Tokenizing stops at the first error token,
// which is also how the tokenizer reports the end of the input; the links
// gathered up to that point are returned. The result is nil when no link was
// found.
func getLinks(body io.Reader) []string {
	var links []string
	z := html.NewTokenizer(body)
	for {
		switch z.Next() {
		case html.ErrorToken:
			return links
		case html.StartTagToken, html.SelfClosingTagToken:
			token := z.Token()
			if token.Data != "a" {
				continue
			}
			for _, attr := range token.Attr {
				if attr.Key == "href" {
					links = append(links, attr.Val)
				}
			}
		}
	}
}