Skip to content

Commit 69cc94a

Browse files
committed
Implement content sniffing for HTML parsing
Web pages can be served without a Content-Type header, in which case browsers employ content sniffing. Do the same here in Colly. While we're at it, change the Content-Type check to something stricter than a mere "html" substring match.
1 parent 4ccfe78 commit 69cc94a

File tree

2 files changed

+53
-2
lines changed

2 files changed

+53
-2
lines changed

colly.go

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"hash/fnv"
2626
"io"
2727
"log"
28+
"mime"
2829
"net/http"
2930
"net/http/cookiejar"
3031
"net/url"
@@ -1117,9 +1118,27 @@ func (c *Collector) handleOnResponseHeaders(r *Response) {
11171118
}
11181119

11191120
func (c *Collector) handleOnHTML(resp *Response) error {
1120-
if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") {
1121+
if len(c.htmlCallbacks) == 0 {
11211122
return nil
11221123
}
1124+
1125+
contentType := resp.Headers.Get("Content-Type")
1126+
if contentType == "" {
1127+
contentType = http.DetectContentType(resp.Body)
1128+
}
1129+
mediaType, _, err := mime.ParseMediaType(contentType)
1130+
if err != nil && err != mime.ErrInvalidMediaParameter {
1131+
return fmt.Errorf("malformed Content-Type header value: %w", err)
1132+
}
1133+
1134+
// TODO: we also want to parse application/xml as XHTML if it has an
1135+
// appropriate doctype
1136+
switch mediaType {
1137+
case "text/html", "application/xhtml+xml":
1138+
default:
1139+
return nil
1140+
}
1141+
11231142
doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body))
11241143
if err != nil {
11251144
return err

colly_test.go

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,11 @@ func newUnstartedTestServer() *httptest.Server {
5252
})
5353

5454
mux.HandleFunc("/html", func(w http.ResponseWriter, r *http.Request) {
55-
w.Header().Set("Content-Type", "text/html")
55+
if r.URL.Query().Get("no-content-type") != "" {
56+
w.Header()["Content-Type"] = nil
57+
} else {
58+
w.Header().Set("Content-Type", "text/html")
59+
}
5660
w.Write([]byte(`<!DOCTYPE html>
5761
<html>
5862
<head>
@@ -627,6 +631,34 @@ func TestCollectorOnHTML(t *testing.T) {
627631
}
628632
}
629633

634+
func TestCollectorContentSniffing(t *testing.T) {
635+
ts := newTestServer()
636+
defer ts.Close()
637+
638+
c := NewCollector()
639+
640+
htmlCallbackCalled := false
641+
642+
c.OnResponse(func(r *Response) {
643+
if (*r.Headers)["Content-Type"] != nil {
644+
t.Error("Content-Type unexpectedly not nil")
645+
}
646+
})
647+
648+
c.OnHTML("html", func(e *HTMLElement) {
649+
htmlCallbackCalled = true
650+
})
651+
652+
err := c.Visit(ts.URL + "/html?no-content-type=yes")
653+
if err != nil {
654+
t.Fatal(err)
655+
}
656+
657+
if !htmlCallbackCalled {
658+
t.Error("OnHTML was not called")
659+
}
660+
}
661+
630662
func TestCollectorURLRevisit(t *testing.T) {
631663
ts := newTestServer()
632664
defer ts.Close()

0 commit comments

Comments
 (0)