After noticing (or remembering? hard to tell) Ben Gamari’s html-parse package, I set out to apply the same approach a colleague and I had used in taggy-lens, so that parsing data out of HTML pages would once again be easy.
A few hours of hacking resulted in this repository, which can, for example, be used as follows.
{-# LANGUAGE OverloadedStrings #-}
module Main where
import Control.Lens
import Control.Monad (forM_)
import Data.Text (Text)
import Network.Wreq
import System.Environment (getArgs)
import Text.HTML.Tree.Lens
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy.Encoding as LT
type URL = String
scrap :: URL -> ([Node] -> a) -> IO a
scrap url f = go <$> get url
  where go r = f (r ^. responseBody.to LT.decodeUtf8.html)

getTitle :: [Node] -> Text
getTitle ns = ns ^. traverse.allNamed "title".allTexts

main :: IO ()
main = do
  args <- getArgs
  forM_ args $ \url -> do
    r <- scrap url getTitle
    T.putStrLn $ T.pack url <> " => " <> r
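In words: scrap fetches the page with wreq, decodes the response body, runs it through the html optic to get a list of parsed nodes, and applies whatever extraction function it was given; getTitle then folds the text of every <title> element into a single Text value (which works because Text is a Monoid). The same combinators can be aimed at other elements. As a hypothetical variation (the getLinkTexts name and the use of ^.. are mine, assuming allNamed and allTexts compose as folds the way the title example suggests), this would collect each <a> element’s text separately instead of concatenating them:

-- hypothetical: collect every <a> element's text, one entry per match
getLinkTexts :: [Node] -> [Text]
getLinkTexts ns = ns ^.. traverse.allNamed "a".allTexts

Plugged into the driver above, scrap url getLinkTexts would have type IO [Text]. The getTitle version, run against a couple of pages, gives: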
$ cabal run html-parse-lens:hackage -- http://blog.fmap.fr http://www.blast-info.fr
http://blog.fmap.fr => From Hask ’Til Dawn
http://www.blast-info.fr => Site d’information, d’actualités et d’investigations indépendant
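For context on what the lens layer is wrapping: html-parse itself exposes a flat token stream (Text.HTML.Parser) and a rose-tree view of it (Text.HTML.Tree), and the Node-based lenses above sit on top of that. Here is a minimal sketch of that lower-level API, written from memory, so the exact names and types are worth double-checking against the package docs:

import qualified Data.Text as T
import Text.HTML.Parser (parseTokens)
import Text.HTML.Tree (tokensToForest)

-- Parse a tiny document into a token stream, then try to nest the tokens
-- into a forest; tokensToForest returns an Either because mismatched tags
-- can make the nesting fail.
lowLevelDemo :: IO ()
lowLevelDemo = do
  let toks = parseTokens (T.pack "<html><head><title>hi</title></head></html>")
  mapM_ print toks
  print (tokensToForest toks)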