2 files changed, +53 -2 lines changed
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@ import (
25
25
"hash/fnv"
26
26
"io"
27
27
"log"
28
+ "mime"
28
29
"net/http"
29
30
"net/http/cookiejar"
30
31
"net/url"
@@ -1117,9 +1118,27 @@ func (c *Collector) handleOnResponseHeaders(r *Response) {
1117
1118
}
1118
1119
1119
1120
func (c * Collector ) handleOnHTML (resp * Response ) error {
1120
- if len (c .htmlCallbacks ) == 0 || ! strings . Contains ( strings . ToLower ( resp . Headers . Get ( "Content-Type" )), "html" ) {
1121
+ if len (c .htmlCallbacks ) == 0 {
1121
1122
return nil
1122
1123
}
1124
+
1125
+ contentType := resp .Headers .Get ("Content-Type" )
1126
+ if contentType == "" {
1127
+ contentType = http .DetectContentType (resp .Body )
1128
+ }
1129
+ mediaType , _ , err := mime .ParseMediaType (contentType )
1130
+ if err != nil && err != mime .ErrInvalidMediaParameter {
1131
+ return fmt .Errorf ("malformed Content-Type header value: %w" , err )
1132
+ }
1133
+
1134
+ // TODO: we also want to parse application/xml as XHTML if it has an
1135
+ // appropriate doctype
1136
+ switch mediaType {
1137
+ case "text/html" , "application/xhtml+xml" :
1138
+ default :
1139
+ return nil
1140
+ }
1141
+
1123
1142
doc , err := goquery .NewDocumentFromReader (bytes .NewBuffer (resp .Body ))
1124
1143
if err != nil {
1125
1144
return err
Original file line number Diff line number Diff line change @@ -52,7 +52,11 @@ func newUnstartedTestServer() *httptest.Server {
52
52
})
53
53
54
54
mux .HandleFunc ("/html" , func (w http.ResponseWriter , r * http.Request ) {
55
- w .Header ().Set ("Content-Type" , "text/html" )
55
+ if r .URL .Query ().Get ("no-content-type" ) != "" {
56
+ w .Header ()["Content-Type" ] = nil
57
+ } else {
58
+ w .Header ().Set ("Content-Type" , "text/html" )
59
+ }
56
60
w .Write ([]byte (`<!DOCTYPE html>
57
61
<html>
58
62
<head>
@@ -627,6 +631,34 @@ func TestCollectorOnHTML(t *testing.T) {
627
631
}
628
632
}
629
633
634
+ func TestCollectorContentSniffing (t * testing.T ) {
635
+ ts := newTestServer ()
636
+ defer ts .Close ()
637
+
638
+ c := NewCollector ()
639
+
640
+ htmlCallbackCalled := false
641
+
642
+ c .OnResponse (func (r * Response ) {
643
+ if (* r .Headers )["Content-Type" ] != nil {
644
+ t .Error ("Content-Type unexpectedly not nil" )
645
+ }
646
+ })
647
+
648
+ c .OnHTML ("html" , func (e * HTMLElement ) {
649
+ htmlCallbackCalled = true
650
+ })
651
+
652
+ err := c .Visit (ts .URL + "/html?no-content-type=yes" )
653
+ if err != nil {
654
+ t .Fatal (err )
655
+ }
656
+
657
+ if ! htmlCallbackCalled {
658
+ t .Error ("OnHTML was not called" )
659
+ }
660
+ }
661
+
630
662
func TestCollectorURLRevisit (t * testing.T ) {
631
663
ts := newTestServer ()
632
664
defer ts .Close ()
You can’t perform that action at this time.
0 commit comments