very basic web crawler on golang - Site Reliability Engineer Blog

Hi guys,

this is day 6 out 100 days of code on go lang.

Following code implement recursive fetching of internal links on given web page.

What I want to do next is to add goroutines in action, because fetching process is nice to have run in parallel. Any suggestion how to do it?

package main

//todo:
// - every link must be visited only once [done]
// - keep a map of visited links [done]
// - fix links and page concatination [done]
// - extract domain from request uri to simplify crawling [done]
// - rework how internal links are selected for crawling
// - fix how crawling settings are set like depth and maxLinks

import (
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"time"

	"golang.org/x/net/html"
)

//hash of visited links to prevent double visit
var visitedLinks map[string]bool
var baseURL = flag.String("url", "", "start url")

func main() {
	visitedLinks = make(map[string]bool)
	flag.Parse()

	if *baseURL == "" {
		log.Fatal("--url paramters is required")
	}

	visitedLinks[*baseURL] = false

	//set parameters for crawling
	crawl("/")
}

func crawl(link string) {
	//check if link already visited
	if visitedLinks[link] {
		return
	}
	//set link as visited
	visitedLinks[link] = true
	fmt.Printf("Crawling %s ..................\n\n", *baseURL+link)
	resp, err := http.Get(*baseURL + link)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	linkCounter := 0
	for _, href := range getLinks(resp.Body) {
		//todo: rework how links are selected
		if len(href) > 0 && string(href[0]) == "/" && // only internal links
			href != link { //skip current page
			if len(href) > 1 && href[1] == '/' { //skip external links which start with //
				continue
			}
			linkCounter++
			//fmt.Printf("Found: %s\n", href)
			crawl(href)
			time.Sleep(time.Second * 1)
		}
	}
}

//Collect all links from response body and return it as an array of strings
func getLinks(body io.Reader) []string {
	var links []string
	z := html.NewTokenizer(body)
	for {
		tt := z.Next()

		switch tt {
		case html.ErrorToken:
			return links
		case html.StartTagToken, html.EndTagToken:
			token := z.Token()
			if "a" == token.Data {
				for _, attr := range token.Attr {
					if attr.Key == "href" {
						links = append(links, attr.Val)
					}

				}
			}

		}
	}
}

Source code on github