mirror of
https://github.com/kevin-DL/services.git
synced 2026-01-11 19:04:35 +00:00
* add rss service * git ignore rss * update readme * go fmt * maintain map by url * make rss service multi-tenant
64 lines
1.1 KiB
Go
64 lines
1.1 KiB
Go
package parser
|
|
|
|
import (
	"errors"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)
|
|
|
|
var (
|
|
parsers = map[string]Parser{
|
|
"a16z.com": a16zParser,
|
|
"cnbc.com": cnbcParser,
|
|
"www.cnbc.com": cnbcParser,
|
|
}
|
|
)
|
|
|
|
// Parser takes the URL of a page and returns content extracted from it as a
// string, or an error if the content could not be obtained.
type Parser func(string) (string, error)
|
|
|
|
func Parse(uri string) (string, error) {
|
|
u, err := url.Parse(uri)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
if v, ok := parsers[u.Host]; ok {
|
|
return v(uri)
|
|
}
|
|
return "", errors.New("no parser for url")
|
|
}
|
|
|
|
func classParser(class string) Parser {
|
|
return func(url string) (string, error) {
|
|
// Request the HTML page.
|
|
res, err := http.Get(url)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
defer res.Body.Close()
|
|
|
|
if res.StatusCode != 200 {
|
|
return "", errors.New("bad status code")
|
|
}
|
|
|
|
// Load the HTML document
|
|
doc, err := goquery.NewDocumentFromReader(res.Body)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return doc.Find(class).Html()
|
|
}
|
|
}
|
|
|
|
func a16zParser(url string) (string, error) {
|
|
return classParser(".blog-content")(url)
|
|
}
|
|
|
|
func cnbcParser(url string) (string, error) {
|
|
return classParser(".PageBuilder-col-9")(url)
|
|
}
|