plunder

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

Mechanism.hs (11462B)


      1 -- Copyright 2023 The Plunder Authors
      2 -- Use of this source code is governed by a BSD-style license that can be
      3 -- found in the LICENSE file.
      4 
      5 {-# OPTIONS_GHC -Wall -Werror #-}
      6 
      7 {-
      8     This file concerns itself with basic lexing and interpreting
      9     indentation-based layout.  It understands nested structures ([],
     10     (), and {}, and juxtaposition), but doesn't interpret those.
     11 
     12     This process is *total*.  We emit unexpected characters as `FAIL`
     13     tokens.  The parsing otherwise has no failure cases.
     14 -}
     15 
     16 module Rex.Mechanism
     17     ( Line(..), Span(..), Lexi(..), Elem, Clump(..), Tree(..), Frag(..)
     18     , StrType(..)
     19     , runic, wordy
     20     , clumpFrag, spanFrag, spanBar
     21     , BlockState, blockState, treeStep
     22     )
     23 where
     24 
     25 import PlunderPrelude hiding (many, head, last)
     26 
     27 import Data.List.NonEmpty (head, last)
     28 import Prelude            (foldl1)
     29 
     30 import qualified Data.ByteString as BS
     31 
     32 
     33 -- Types -----------------------------------------------------------------------
     34 
     35 data Line = LN
     36     { fil :: !FilePath
     37     , num :: !Int
     38     , byt :: !ByteString
     39     }
     40   deriving (Eq, Ord, Show)
     41 
     42 data Span a = S
     43     { lin :: !Line -- line
     44     , off :: !Int  -- offset
     45     , end :: !Int  -- end
     46     , x :: a
     47     }
     48   deriving (Eq, Ord, Show, Functor)
     49 
     50 data StrType = CURLY | QUOTED
     51   deriving (Eq, Ord, Show)
     52 
     53 data Lexi a
     54     = RUNE
     55     | WORD
     56     | WYTE
     57     | SEMI
     58     | LINE [Span ()]
     59     | CORD StrType
     60     | PARA a
     61     | BRAK a
     62     | FAIL
     63     | TERM
     64   deriving (Eq, Ord, Show, Functor)
     65 
     66 newtype Lexeme = L { t :: Span (Lexi [Lexeme]) }
     67   deriving (Eq, Ord, Show)
     68 
     69 type Elem = Span (Lexi [Clump])
     70 
     71 newtype Clump = C { cs :: Span (NonEmpty Elem) }
     72 
     73 
     74 -- Do Basic Lexing + Nesting ---------------------------------------------------
     75 
     76 lex :: Line -> [Lexeme]
     77 lex ln@(LN _ _ b) = many 0 0
     78   where
     79     wid = length b
     80 
     81     x off end tok = L (S ln off end tok)
     82 
     83     many i ctx = l : (if l.t.x == TERM then [] else many l.t.end ctx)
     84                    where l = one i ctx
     85 
     86     nest cn ctx i = x i (unsafeLast ts).t.end (cn ts)
     87                       where ts = many (i+1) ctx
     88 
     89     eat o f = fromMaybe wid $ fmap (+o) $ BS.findIndex (not . f) $ drop o b
     90 
     91     curl :: Int -> Int -> Int
     92     curl i 0 = i
     93     curl i d = maybe i (\c -> curl (i+1) (d + curlStep c)) (b BS.!? i)
     94                  where curlStep = \case { 123 -> 1; 125 -> (-1); _ -> 0 }
     95 
     96     str i = x i (min wid (succ $ eat (i+1) (/= 34))) (CORD CURLY)
     97 
     98     quote i = let c = fromMaybe 0 (b BS.!? succ i) in
     99               if c==0 || c==32 then x i wid (LINE[]) else
    100               x i (min wid $ succ $ eat (i+2) (/= c)) (CORD QUOTED)
    101 
    102     one i ctx = case fromMaybe 0 (b BS.!? i) of
    103                     40  {- ( -}      -> nest PARA 41  i  {- ) -}
    104                     91  {- [ -}      -> nest BRAK 93  i  {- ] -}
    105                     34  {- " -}      -> str i
    106                     125 {- } -}      -> quote i
    107                     123 {- { -}      -> x i (curl (i+1) 1)         (CORD CURLY)
    108                     0   {- ΓΈ -}      -> x i wid                    TERM
    109                     59  {- ; -}      -> x i wid                    SEMI
    110                     32  {-   -}      -> x i (eat i (== 32))        WYTE
    111                     c | elem c wordy -> x i (eat i (`elem` wordy)) WORD
    112                     c | elem c runic -> x i (eat i (`elem` runic)) RUNE
    113                     c | ctx==c       -> x i (i+1)                  TERM
    114                     _                -> x i (i+1)                  FAIL
    115 
    116 runic, wordy :: ByteString
    117 wordy = encodeUtf8 (pack ("_" <> ['a'..'z'] <> ['A'..'Z'] <> ['0'..'9']))
    118 runic = "$!#%&*+,-./:<=>?@\\^`'|~"
    119 
    120 
    121 -- Merge Multi-Line Strings ----------------------------------------------------
    122 
    123 multiLine :: [Lexeme] -> [Lexeme]
    124 multiLine = \case
    125     ( ( L ln@S{x=LINE acc}
    126       : L S{x=TERM}
    127       : (getMatch ln -> Just (bs, more))
    128       )) -> multiLine ( L ln{x=(LINE (bs:acc))} : more )
    129     []   -> []
    130     l:ls -> l : multiLine ls
    131   where
    132     getMatch s1 toks = do
    133         (s2, more) <- case toks of
    134             L s@S{x=LINE _}               : m -> Just (s,m)
    135             L S{x=WYTE} : L s@S{x=LINE _} : m -> Just (s,m)
    136             _                                 -> Nothing
    137         guard ((s1.off == s2.off))
    138         pure (const () <$> s2, more)
    139 
    140 
    141 -- Clump Juxtaposed Tokens -----------------------------------------------------
    142 
    143 mkClump :: NonEmpty Elem -> Clump
    144 mkClump x = C S{x=x, lin=(head x).lin, off=(head x).off, end=(last x).end}
    145 
    146 lexemeElem :: Lexeme -> Elem
    147 lexemeElem (L (S l o e x)) = S l o e (clump <$> x)
    148 
    149 clump :: [Lexeme] -> [Clump]
    150 clump = go [] . fmap lexemeElem
    151   where
    152     go :: [Elem] -> [Elem] -> [Clump]
    153     go []     []               = []
    154     go []     (i:is) | isSpc i = go [] is
    155     go (t:ts) []               = mkClump (reverse (t:|ts)) : []
    156     go (t:ts) is | isEnd is    = mkClump (reverse (t:|ts)) : go [] is
    157     go ts     (i:is)           = go (i:ts) is
    158 
    159     isSpc S{x=WYTE} = True
    160     isSpc S{x=SEMI} = True
    161     isSpc S{x=TERM} = True
    162     isSpc _         = False
    163 
    164     isEnd ((isSpc->True) : _)                 = True
    165     isEnd (S{x=RUNE}     : [])                = True
    166     isEnd (S{x=RUNE}     : (isSpc->True) : _) = True
    167     isEnd _                                   = False
    168 
    169 
    170 -- Layout Engine (Understand Structure Implied by Indentation) -----------------
    171 
    172 data Tree = LEAF !Clump | NODE !ByteString ![Tree] !(Maybe Tree)
    173 
    174 data Frag a = WOLF ByteString a | LAMB a
    175 
    176 data Item = I ByteString [Tree] (Maybe Tree)
    177 
    178 fTree :: Frag Clump -> Tree
    179 fTree (LAMB c)   = LEAF c
    180 fTree (WOLF b _) = NODE b [] Nothing
    181 
    182 iTree :: Item -> Tree
    183 iTree (I t x k) = NODE t (reverse x) k
    184 
    185 close :: Int -> [(Int, Item)] -> [(Int, Item)]
    186 close _ []                   = []
    187 close p (i:is)  | p >= fst i = (i:is)
    188 close p (i:j:k)              = let ij = merge (iTree <$> i) j in close p (ij:k)
    189 close _ _                    = error "indent too small.  Bug in block splitter"
    190 
    191 pushOnto :: [(Int, Item)] -> (Int, Frag Clump) -> [(Int, Item)]
    192 pushOnto stack (fp,f) =
    193     let stc = close fp stack
    194     in case (f, stc) of
    195         (LAMB _,   [])   -> error "impossible; this case is already handled"
    196         (WOLF r _, is)   -> ((fp, I r [] Nothing) : is)
    197         (LAMB _,   i:is) -> (merge (fp, fTree f) i : is)
    198 
    199 merge :: (Int, Tree) -> (Int, Item) -> (Int, Item)
    200 merge (rp,r) (ip,i) =
    201     case (compare rp ip, i) of
    202         (LT , _              ) -> error "impossible merge"
    203         (_  , I t cs (Just k)) -> (ip, I t (k:cs) (Just r))
    204         (EQ , I t cs Nothing ) -> (ip, I t cs (Just r))
    205         (GT , I t cs Nothing ) -> (ip, I t (r:cs) Nothing)
    206 
    207 layout :: [(Int, Frag Clump)] -> [Tree]
    208 layout = \case
    209     []                 -> []
    210     (_, f@LAMB{}) : fs -> fTree f : layout fs
    211     (p, WOLF r _) : fs -> [iTree $ snd $ foldl1 forceMerge pushed]
    212                             where pushed = pushAll (p, I r [] Nothing) fs
    213   where
    214     pushAll :: (Int, Item) -> [(Int, Frag Clump)] -> [(Int, Item)]
    215     pushAll pf fs = foldl' pushOnto [pf] fs
    216 
    217     forceMerge :: (Int, Item) -> (Int, Item) -> (Int, Item)
    218     forceMerge a b = merge (iTree <$> a) b
    219 
    220 
    221 -- Block Splitter --------------------------------------------------------------
    222 
    223 data LineCat = VOID | NOTE Int | OPEN Int Int | TEXT Int | SING Int
    224   deriving (Eq, Ord, Show)
    225 
    226 data BlockBuffer
    227     = WOODS
    228     | TEXTY { ls :: ![[Lexeme]], deep :: !Int }
    229     | BLOCK { ls :: ![[Lexeme]], deep :: !Int, prev :: !Int }
    230 
    231 data BlockState = BS FilePath Int BlockBuffer
    232 
    233 lineCat :: [Lexeme] -> LineCat
    234 lineCat  = \case
    235     []                                 -> VOID
    236     L S{x=TERM{}}                 : _  -> VOID
    237     L s@S{x=SEMI{}}               : _  -> NOTE s.off
    238     L s@S{x=(LINE _)}             : _  -> TEXT s.off
    239     L s@S{x=RUNE{}} : L S{x=TERM} : _  -> OPEN (s.end - 1) s.off
    240     L s@S{x=RUNE{}} : L S{x=WYTE} : _  -> OPEN (s.end - 1) s.off
    241     L s@S{x=RUNE{}} : L S{x=SEMI} : _  -> OPEN (s.end - 1) s.off
    242     L(S{x=WYTE{}})                : ls -> lineCat ls
    243     L s                           : _  -> SING s.off
    244 
    245 blockDepth :: LineCat -> Int
    246 blockDepth (TEXT d) = d
    247 blockDepth (OPEN d _) = d
    248 blockDepth VOID     = 0
    249 blockDepth (NOTE d) = d
    250 blockDepth (SING d) = d
    251 
    252 absoluteDepth :: LineCat -> Int
    253 absoluteDepth (TEXT d)   = d
    254 absoluteDepth (OPEN _ d) = d
    255 absoluteDepth VOID       = 0
    256 absoluteDepth (NOTE d)   = d
    257 absoluteDepth (SING d)   = d
    258 
    259 blockStep
    260     :: BlockState
    261     -> Maybe [Lexeme]
    262     -> (BlockState, [[[Lexeme]]])
    263 blockStep (BS fn lno st) inp = case inp of
    264     Nothing ->
    265         (BS fn lno WOODS,) $
    266         case st of
    267             WOODS     -> []
    268             TEXTY{ls} -> [reverse ls]
    269             BLOCK{ls} -> [reverse ls]
    270 
    271     Just l ->
    272         let
    273             c = lineCat l
    274             x s o = (BS fn (lno+1) s, o)
    275             bd = blockDepth c
    276             ad = absoluteDepth c
    277             breakIt ls = (reverse ls :) <$> blockStep (BS fn lno WOODS) inp
    278         in case (st, c) of
    279             (WOODS, TEXT{}) -> x (TEXTY [l] bd) []
    280             (WOODS, OPEN{}) -> x (BLOCK [l] bd (ad+1)) []
    281             (WOODS, VOID)   -> x WOODS []
    282             (WOODS, NOTE{}) -> x WOODS []
    283             (WOODS, SING{}) -> x WOODS [[l]]
    284 
    285             (TEXTY{..}, TEXT d) | d==deep -> x TEXTY{ls=l:ls,..} []
    286             (TEXTY{ls}, _)                -> breakIt ls
    287 
    288             -- If we see a blank line, but the previous line was not
    289             -- blank, then don't break the block yet.  We will decide
    290             -- once we see the next line.
    291             --
    292             -- However, if the previous line was blank too (depth = -1),
    293             -- then this is a double-blank line, so we break here.
    294             (BLOCK{ls,deep,prev}, VOID) ->
    295                 if prev > 0
    296                 then x BLOCK{deep,prev=0,ls} []
    297                 else breakIt ls
    298 
    299             -- If we already forgave a blank line, but we see another
    300             -- input that maximally unindented, then break before
    301             -- this line.
    302             (BLOCK{ls,prev=0}, _) | ad==0 ->
    303                 (reverse ls :) <$> blockStep (BS fn lno WOODS) inp
    304 
    305             -- If we see an input that's indented enough to be included
    306             -- in this block, then include it in the block and remember
    307             -- it's depth.
    308             --
    309             -- However, if it's *not* indented enough to be included in
    310             -- the block, break before that line.
    311             (BLOCK{ls,deep}, _) ->
    312                 if bd >= deep
    313                 then x BLOCK{deep,prev=ad+1,ls=(l:ls)} []
    314                 else breakIt ls
    315 
    316 
    317 -- Take Lines, Spit out Trees --------------------------------------------------
    318 
    319 spanFrag :: Span (Lexi a) -> Frag (Span (Lexi a))
    320 spanFrag s@S{x=RUNE} = WOLF (spanBar s) s
    321 spanFrag s           = LAMB s
    322 
    323 spanBar :: Span a -> ByteString
    324 spanBar s = take (s.end - s.off) $ drop s.off s.lin.byt
    325 
    326 clumpFrag :: Clump -> (Int, Frag Clump)
    327 clumpFrag c@(C cs) =
    328     case cs.x of
    329         _ :| (_:_) -> (cs.off, LAMB c)
    330         e :| []    -> case spanFrag e of
    331                           WOLF rune _ -> (cs.off + length rune - 1, WOLF rune c)
    332                           LAMB _      -> (cs.off, LAMB c)
    333 
    334 blockState :: FilePath -> BlockState
    335 blockState fn = BS fn 1 WOODS
    336 
    337 treeStep :: BlockState -> Maybe ByteString -> (BlockState, [(Int, [Tree])])
    338 treeStep st@(BS fil num _) mInp = (st2, treeOut)
    339   where
    340     (st2, out) = blockStep st (mInp <&> \byt -> lex (LN fil num byt))
    341     treeOut = out <&> \bls ->
    342                   ( (unsafeHead (unsafeHead bls)).t.lin.num
    343                   , layout $ fmap clumpFrag $ clump $ multiLine $ concat bls
    344                   )