NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Leading whitespace on context lines is
reconstructed in gofmt style and is an assumption; verify it against the base
tree, or apply with `git apply --ignore-whitespace`. The index lines carry the
blob IDs of the original patch; the post-image ID for feeds/parser/parser.go
no longer matches the revised file below.

diff --git a/feeds/handler/crawl.go b/feeds/handler/crawl.go
index ead9912..8cfc4f2 100644
--- a/feeds/handler/crawl.go
+++ b/feeds/handler/crawl.go
@@ -8,6 +8,7 @@ import (
 	"net/url"
 
 	"github.com/SlyMarbo/rss"
+	"github.com/micro/services/feeds/parser"
 	log "github.com/micro/micro/v3/service/logger"
 	feeds "github.com/micro/services/feeds/proto"
 	posts "github.com/micro/services/posts/proto"
@@ -69,8 +70,15 @@ func (e *Feeds) fetch(f *feeds.Feed) error {
 			content = item.Summary
 		}
 
+		// if we have a parser which returns content use it
+		// e.g cnbc
+		c, err := parser.Parse(item.Link)
+		if err == nil && len(c) > 0 {
+			content = c
+		}
+
 		// @todo make this optional
-		_, err := e.postsService.Save(context.TODO(), &posts.SaveRequest{
+		_, err = e.postsService.Save(context.TODO(), &posts.SaveRequest{
 			Id:      id,
 			Title:   item.Title,
 			Content: content,
diff --git a/feeds/parser/parser.go b/feeds/parser/parser.go
new file mode 100644
index 0000000..7391540
--- /dev/null
+++ b/feeds/parser/parser.go
@@ -0,0 +1,73 @@
+// Package parser fetches the pages that feed items link to and extracts
+// their article content using site-specific parsers.
+package parser
+
+import (
+	"errors"
+	"net/http"
+	"net/url"
+	"strings"
+	"time"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+// ErrNoParser is returned by Parse when no parser is registered for the
+// host of the URL.
+var ErrNoParser = errors.New("no parser for url")
+
+var (
+	// parsers maps a lower-case hostname, without port, to its Parser.
+	parsers = map[string]Parser{
+		"cnbc.com":     cnbcParser,
+		"www.cnbc.com": cnbcParser,
+	}
+
+	// client is shared by every parser. http.Get uses http.DefaultClient,
+	// which has no timeout, so one unresponsive site could block the caller
+	// indefinitely.
+	client = &http.Client{Timeout: 30 * time.Second}
+)
+
+// Parser fetches the page at the given URL and returns its content as HTML.
+type Parser func(string) (string, error)
+
+// Parse looks up the parser registered for the host of uri and returns the
+// content it extracts. It returns ErrNoParser when the host has no parser,
+// and the url.Parse error when uri is malformed.
+func Parse(uri string) (string, error) {
+	u, err := url.Parse(uri)
+	if err != nil {
+		return "", err
+	}
+
+	// u.Host keeps any ":port" suffix and the original letter case, so
+	// normalise before the lookup to match "WWW.CNBC.COM:443" as well.
+	if p, ok := parsers[strings.ToLower(u.Hostname())]; ok {
+		return p(uri)
+	}
+	return "", ErrNoParser
+}
+
+// cnbcParser downloads a cnbc.com page and returns the HTML contents of the
+// first element matching the ".PageBuilder-col-9" selector.
+func cnbcParser(uri string) (string, error) {
+	// Request the HTML page.
+	res, err := client.Get(uri)
+	if err != nil {
+		return "", err
+	}
+	defer res.Body.Close()
+
+	if res.StatusCode != http.StatusOK {
+		return "", errors.New("bad status code")
+	}
+
+	// Load the HTML document.
+	doc, err := goquery.NewDocumentFromReader(res.Body)
+	if err != nil {
+		return "", err
+	}
+
+	return doc.Find(".PageBuilder-col-9").Html()
+}
diff --git a/go.mod b/go.mod
index 37ae001..abb4645 100644
--- a/go.mod
+++ b/go.mod
@@ -4,6 +4,7 @@ go 1.14
 
 require (
 	github.com/Masterminds/semver/v3 v3.1.1
+	github.com/PuerkitoBio/goquery v1.6.1
 	github.com/SlyMarbo/rss v1.0.1
 	github.com/getkin/kin-openapi v0.26.0
 	github.com/golang/protobuf v1.4.3
diff --git a/go.sum b/go.sum
index 5c16e91..4cfb865 100644
--- a/go.sum
+++ b/go.sum
@@ -49,6 +49,8 @@ github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym
 github.com/Masterminds/semver/v3 v3.1.1 h1:hLg3sBzpNErnxhQtUy/mmLR2I9foDujNK030IGemrRc=
 github.com/Masterminds/semver/v3 v3.1.1/go.mod h1:VPu/7SZ7ePZ3QOrcuXROw5FAcLl4a0cBrbBpGY/8hQs=
 github.com/OpenDNS/vegadns2client v0.0.0-20180418235048-a3fa4a771d87/go.mod h1:iGLljf5n9GjT6kc0HBvyI1nOKnGQbNB66VzSNbK5iks=
+github.com/PuerkitoBio/goquery v1.6.1 h1:FgjbQZKl5HTmcn4sKBgvx8vv63nhyhIpv7lJpFGCWpk=
+github.com/PuerkitoBio/goquery v1.6.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
 github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWXgklEdEo=
 github.com/Shopify/toxiproxy v2.1.4+incompatible/go.mod h1:OXgGpZ6Cli1/URJOF1DMxUHB2q5Ap20/P/eIdh4G0pI=
 github.com/SlyMarbo/rss v1.0.1 h1:fiaIU5UhcXauVOniHOIocWG7uj8Ej6pHNarMGPJilzA=
@@ -58,6 +60,8 @@ github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuy
 github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
 github.com/aliyun/alibaba-cloud-sdk-go v0.0.0-20190808125512-07798873deee/go.mod h1:myCDvQSzCW+wB1WAlocEru4wMGJxy+vlxHdhegi1CDQ=
 github.com/aliyun/aliyun-oss-go-sdk v0.0.0-20190307165228-86c17b95fcd5/go.mod h1:T/Aws4fEfogEE9v+HPhhw+CntffsBHJ8nXQCwKr0/g8=
+github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
+github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
 github.com/apache/thrift v0.12.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ=
 github.com/aws/aws-sdk-go v1.23.0/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo=
 github.com/axgle/mahonia v0.0.0-20180208002826-3358181d7394 h1:OYA+5W64v3OgClL+IrOD63t4i/RW7RqrAVl9LTZ9UqQ=
@@ -519,6 +523,7 @@ golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzB
 golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
 golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
 golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
+golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20180611182652-db08ff08e862/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=