Implement content sniffing for HTML parsing

WGH- · WGH- · commit 69cc94af94dc · 2024-03-25T21:48:25.000+03:00
Web pages can be served without a Content-Type header set, in which case
browsers fall back to content sniffing. Do the same here, in Colly.

While we're at it, change the Content-Type check to something stricter than
a mere "html" substring match.
diff --git a/colly.go b/colly.go
@@ -25,6 +25,7 @@ import (
 	"hash/fnv"
 	"io"
 	"log"
+	"mime"
 	"net/http"
 	"net/http/cookiejar"
 	"net/url"
@@ -1117,9 +1118,27 @@ func (c *Collector) handleOnResponseHeaders(r *Response) {
 }
 
 func (c *Collector) handleOnHTML(resp *Response) error {
-	if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") {
+	if len(c.htmlCallbacks) == 0 {
 		return nil
 	}
+
+	contentType := resp.Headers.Get("Content-Type")
+	if contentType == "" {
+		contentType = http.DetectContentType(resp.Body)
+	}
+	mediaType, _, err := mime.ParseMediaType(contentType)
+	if err != nil && err != mime.ErrInvalidMediaParameter {
+		return fmt.Errorf("malformed Content-Type header value: %w", err)
+	}
+
+	// TODO we also want to parse application/xml as XHTML if it has
+	// an appropriate doctype
+	switch mediaType {
+	case "text/html", "application/xhtml+xml":
+	default:
+		return nil
+	}
+
 	doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body))
 	if err != nil {
 		return err
diff --git a/colly_test.go b/colly_test.go
@@ -52,7 +52,11 @@ func newUnstartedTestServer() *httptest.Server {
 	})
 
 	mux.HandleFunc("/html", func(w http.ResponseWriter, r *http.Request) {
-		w.Header().Set("Content-Type", "text/html")
+		if r.URL.Query().Get("no-content-type") != "" {
+			w.Header()["Content-Type"] = nil
+		} else {
+			w.Header().Set("Content-Type", "text/html")
+		}
 		w.Write([]byte(`<!DOCTYPE html>
 <html>
 <head>
@@ -627,6 +631,34 @@ func TestCollectorOnHTML(t *testing.T) {
 	}
 }
 
+func TestCollectorContentSniffing(t *testing.T) {
+	ts := newTestServer()
+	defer ts.Close()
+
+	c := NewCollector()
+
+	htmlCallbackCalled := false
+
+	c.OnResponse(func(r *Response) {
+		if (*r.Headers)["Content-Type"] != nil {
+			t.Error("Content-Type unexpectedly not nil")
+		}
+	})
+
+	c.OnHTML("html", func(e *HTMLElement) {
+		htmlCallbackCalled = true
+	})
+
+	err := c.Visit(ts.URL + "/html?no-content-type=yes")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if !htmlCallbackCalled {
+		t.Error("OnHTML was not called")
+	}
+}
+
 func TestCollectorURLRevisit(t *testing.T) {
 	ts := newTestServer()
 	defer ts.Close()