clx-browser

[ACTIVE] a smol browser based on circumflex
git clone git://git.figbert.com/clx-browser.git
Log | Files | Refs | README | LICENSE

reader.go (1396B)


      1 package main
      2 
      3 import (
      4 	"fmt"
      5 	"net/http"
      6 	"net/url"
      7 	"strings"
      8 	"time"
      9 
     10 	md "github.com/JohannesKaufmann/html-to-markdown"
     11 	"github.com/JohannesKaufmann/html-to-markdown/plugin"
     12 
     13 	"github.com/go-shiori/go-readability"
     14 
     15 	"github.com/PuerkitoBio/goquery"
     16 
     17 	"github.com/kozmos/clean-url"
     18 )
     19 
     20 func getArticle(URL string) (string, string) {
     21 	client := http.Client{Timeout: 5 * time.Second}
     22 	response, err := client.Get(URL)
     23 	if err != nil {
     24 		panic(err)
     25 	}
     26 	defer response.Body.Close()
     27 
     28 	pageURL, err := url.Parse(URL)
     29 	if err != nil {
     30 		panic(err)
     31 	}
     32 
     33 	art, err := readability.FromReader(response.Body, pageURL)
     34 	if err != nil {
     35 		panic(err)
     36 	}
     37 
     38 	href := md.Rule{
     39 		Filter: []string{"a"},
     40 		Replacement: func(content string, selec *goquery.Selection, opt *md.Options) *string {
     41 			href, ok := selec.Attr("href")
     42 			if !ok {
     43 				return md.String(strings.TrimSpace(content))
     44 			}
     45 
     46 			parsedURL, err := url.Parse(href)
     47 			if err != nil {
     48 				panic(err)
     49 			}
     50 
     51 			return md.String(
     52 				fmt.Sprintf(
     53 					"[%s](%s://%s)",
     54 					strings.TrimSpace(content), parsedURL.Scheme,
     55 					cleanurl.Clean(opt.GetAbsoluteURL(selec, href, pageURL.Host)),
     56 				),
     57 			)
     58 		},
     59 	}
     60 
     61 	opt := &md.Options{}
     62 	converter := md.NewConverter(URL, true, opt)
     63 	converter.AddRules(href)
     64 	converter.Use(plugin.Table())
     65 	markdown, err := converter.ConvertString(art.Content)
     66 	if err != nil {
     67 		panic(err)
     68 	}
     69 
     70 	return art.Title, markdown
     71 }