add parser to feeds service

This commit is contained in:
Asim Aslam
2021-02-27 20:14:17 +00:00
parent a453566c26
commit 984bc4fb87
4 changed files with 67 additions and 1 deletions

View File

@@ -8,6 +8,7 @@ import (
"net/url"
"github.com/SlyMarbo/rss"
"github.com/micro/services/feeds/parser"
log "github.com/micro/micro/v3/service/logger"
feeds "github.com/micro/services/feeds/proto"
posts "github.com/micro/services/posts/proto"
@@ -69,8 +70,15 @@ func (e *Feeds) fetch(f *feeds.Feed) error {
content = item.Summary
}
// if we have a parser which returns content use it
// e.g cnbc
c, err := parser.Parse(item.Link)
if err == nil && len(c) > 0 {
content = c
}
// @todo make this optional
_, err := e.postsService.Save(context.TODO(), &posts.SaveRequest{
_, err = e.postsService.Save(context.TODO(), &posts.SaveRequest{
Id: id,
Title: item.Title,
Content: content,

52
feeds/parser/parser.go Normal file
View File

@@ -0,0 +1,52 @@
package parser
import (
	"errors"
	"net/http"
	"net/url"
	"time"

	"github.com/PuerkitoBio/goquery"
)
// parsers maps a URL host to the Parser used to extract content from pages
// served by that host. Both the bare and the "www." form of a host are listed
// because lookups are exact string matches.
var parsers = map[string]Parser{
	"www.cnbc.com": cnbcParser,
	"cnbc.com":     cnbcParser,
}
// Parser extracts content from the page identified by uri. It returns the
// extracted content, or an error if the content could not be obtained.
type Parser func(uri string) (content string, err error)
// Parse returns the content extracted from the page at uri by the parser
// registered for the URI's host.
//
// It returns an error if uri cannot be parsed as a URL, if no parser is
// registered for its host, or if the selected parser itself fails.
func Parse(uri string) (string, error) {
	u, err := url.Parse(uri)
	if err != nil {
		return "", err
	}

	// Look up by Hostname rather than Host: Host keeps an explicit port
	// (e.g. "www.cnbc.com:443"), which would never match a registered key.
	p, ok := parsers[u.Hostname()]
	if !ok {
		return "", errors.New("no parser for url")
	}
	return p(uri)
}
// cnbcClient is the HTTP client used to fetch CNBC pages. It sets an explicit
// timeout because http.DefaultClient has none, so a server that never responds
// would otherwise block the caller indefinitely.
var cnbcClient = &http.Client{Timeout: 10 * time.Second}

// cnbcParser fetches the page at url and returns the inner HTML of the first
// element matching the ".PageBuilder-col-9" selector.
//
// It returns an error if the request fails, if the response status is not
// 200 OK, or if the body cannot be parsed as HTML. When no element matches,
// the result is whatever goquery's Html returns for an empty selection
// (expected to be an empty string with a nil error; confirm against the
// goquery documentation).
func cnbcParser(url string) (string, error) {
	// Request the HTML page.
	res, err := cnbcClient.Get(url)
	if err != nil {
		return "", err
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return "", errors.New("bad status code")
	}

	// Load the HTML document.
	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return "", err
	}

	return doc.Find(".PageBuilder-col-9").Html()
}