After noticing (or remembering? hard to tell) Ben Gamari’s html-parse package, I set out to apply the same approach a colleague and I used in taggy-lens to again be able to easily parse data out of HTML pages.
A few hours of hacking resulted in this repository, which can be used, for example, as follows.
{-# LANGUAGE OverloadedStrings #-}
module Main where
import Control.Lens
import Control.Monad (forM_)
import Data.Text (Text)
import Network.Wreq
import System.Environment (getArgs)
import Text.HTML.Tree.Lens
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy.Encoding as LT
-- | Address of a page to fetch, kept as a plain 'String'.
type URL = String
-- | Fetch @url@ with 'get', decode the response body as UTF-8, run it through
-- the 'html' optic to obtain a list of 'Node's, and return the result of
-- applying @f@ to those nodes.
scrap :: URL -> ([Node] -> a) -> IO a
scrap url f = do
  response <- get url
  -- Body bytes -> lazy Text -> parsed nodes, all in one optic chain.
  let nodes = response ^. responseBody . to LT.decodeUtf8 . html
  pure (f nodes)
-- | Collect the text selected by @allNamed "title" . allTexts@ across all the
-- given nodes; the individual results are combined with the 'Text' monoid.
-- Presumably this yields the page's @<title>@ contents -- confirm against the
-- "Text.HTML.Tree.Lens" documentation.
getTitle :: [Node] -> Text
getTitle = view (traverse . allNamed "title" . allTexts)
-- | For every URL given on the command line, scrape the page with 'scrap' and
-- 'getTitle', then print one @url => title@ line to standard output.
main :: IO ()
main = do
  args <- getArgs
  forM_ args $ \url -> do
    r <- scrap url getTitle
    T.putStrLn $ T.pack url <> " => " <> r

$ cabal run html-parse-lens:hackage -- http://blog.fmap.fr http://www.blast-info.fr
http://blog.fmap.fr => From Hask ’Til Dawn
http://www.blast-info.fr => Site d’information, d’actualités et d’investigations indépendant

Posted: