Mechanism.hs (11462B)
1 -- Copyright 2023 The Plunder Authors 2 -- Use of this source code is governed by a BSD-style license that can be 3 -- found in the LICENSE file. 4 5 {-# OPTIONS_GHC -Wall -Werror #-} 6 7 {- 8 This file concerns itself with basic lexing and interpreting 9 indentation-based layout. It understands nested structures ([], 10 (), and {}, and juxtaposition), but doesn't interpret those. 11 12 This process is *total*. We emit unexpected characters as `FAIL` 13 tokens. The parsing otherwise has no failure cases. 14 -} 15 16 module Rex.Mechanism 17 ( Line(..), Span(..), Lexi(..), Elem, Clump(..), Tree(..), Frag(..) 18 , StrType(..) 19 , runic, wordy 20 , clumpFrag, spanFrag, spanBar 21 , BlockState, blockState, treeStep 22 ) 23 where 24 25 import PlunderPrelude hiding (many, head, last) 26 27 import Data.List.NonEmpty (head, last) 28 import Prelude (foldl1) 29 30 import qualified Data.ByteString as BS 31 32 33 -- Types ----------------------------------------------------------------------- 34 35 data Line = LN 36 { fil :: !FilePath 37 , num :: !Int 38 , byt :: !ByteString 39 } 40 deriving (Eq, Ord, Show) 41 42 data Span a = S 43 { lin :: !Line -- line 44 , off :: !Int -- offset 45 , end :: !Int -- end 46 , x :: a 47 } 48 deriving (Eq, Ord, Show, Functor) 49 50 data StrType = CURLY | QUOTED 51 deriving (Eq, Ord, Show) 52 53 data Lexi a 54 = RUNE 55 | WORD 56 | WYTE 57 | SEMI 58 | LINE [Span ()] 59 | CORD StrType 60 | PARA a 61 | BRAK a 62 | FAIL 63 | TERM 64 deriving (Eq, Ord, Show, Functor) 65 66 newtype Lexeme = L { t :: Span (Lexi [Lexeme]) } 67 deriving (Eq, Ord, Show) 68 69 type Elem = Span (Lexi [Clump]) 70 71 newtype Clump = C { cs :: Span (NonEmpty Elem) } 72 73 74 -- Do Basic Lexing + Nesting --------------------------------------------------- 75 76 lex :: Line -> [Lexeme] 77 lex ln@(LN _ _ b) = many 0 0 78 where 79 wid = length b 80 81 x off end tok = L (S ln off end tok) 82 83 many i ctx = l : (if l.t.x == TERM then [] else many l.t.end ctx) 84 where l = one i ctx 85 86 nest cn ctx i = x i (unsafeLast ts).t.end (cn ts) 87 where ts = many (i+1) ctx 88 89 eat o f = fromMaybe wid $ fmap (+o) $ BS.findIndex (not . f) $ drop o b 90 91 curl :: Int -> Int -> Int 92 curl i 0 = i 93 curl i d = maybe i (\c -> curl (i+1) (d + curlStep c)) (b BS.!? i) 94 where curlStep = \case { 123 -> 1; 125 -> (-1); _ -> 0 } 95 96 str i = x i (min wid (succ $ eat (i+1) (/= 34))) (CORD CURLY) 97 98 quote i = let c = fromMaybe 0 (b BS.!? succ i) in 99 if c==0 || c==32 then x i wid (LINE[]) else 100 x i (min wid $ succ $ eat (i+2) (/= c)) (CORD QUOTED) 101 102 one i ctx = case fromMaybe 0 (b BS.!? i) of 103 40 {- ( -} -> nest PARA 41 i {- ) -} 104 91 {- [ -} -> nest BRAK 93 i {- ] -} 105 34 {- " -} -> str i 106 125 {- } -} -> quote i 107 123 {- { -} -> x i (curl (i+1) 1) (CORD CURLY) 108 0 {- ΓΈ -} -> x i wid TERM 109 59 {- ; -} -> x i wid SEMI 110 32 {- -} -> x i (eat i (== 32)) WYTE 111 c | elem c wordy -> x i (eat i (`elem` wordy)) WORD 112 c | elem c runic -> x i (eat i (`elem` runic)) RUNE 113 c | ctx==c -> x i (i+1) TERM 114 _ -> x i (i+1) FAIL 115 116 runic, wordy :: ByteString 117 wordy = encodeUtf8 (pack ("_" <> ['a'..'z'] <> ['A'..'Z'] <> ['0'..'9'])) 118 runic = "$!#%&*+,-./:<=>?@\\^`'|~" 119 120 121 -- Merge Multi-Line Strings ---------------------------------------------------- 122 123 multiLine :: [Lexeme] -> [Lexeme] 124 multiLine = \case 125 ( ( L ln@S{x=LINE acc} 126 : L S{x=TERM} 127 : (getMatch ln -> Just (bs, more)) 128 )) -> multiLine ( L ln{x=(LINE (bs:acc))} : more ) 129 [] -> [] 130 l:ls -> l : multiLine ls 131 where 132 getMatch s1 toks = do 133 (s2, more) <- case toks of 134 L s@S{x=LINE _} : m -> Just (s,m) 135 L S{x=WYTE} : L s@S{x=LINE _} : m -> Just (s,m) 136 _ -> Nothing 137 guard ((s1.off == s2.off)) 138 pure (const () <$> s2, more) 139 140 141 -- Clump Juxtaposed Tokens ----------------------------------------------------- 142 143 mkClump :: NonEmpty Elem -> Clump 144 mkClump x = C S{x=x, lin=(head x).lin, off=(head x).off, end=(last x).end} 145 146 lexemeElem :: Lexeme -> Elem 147 lexemeElem (L (S l o e x)) = S l o e (clump <$> x) 148 149 clump :: [Lexeme] -> [Clump] 150 clump = go [] . fmap lexemeElem 151 where 152 go :: [Elem] -> [Elem] -> [Clump] 153 go [] [] = [] 154 go [] (i:is) | isSpc i = go [] is 155 go (t:ts) [] = mkClump (reverse (t:|ts)) : [] 156 go (t:ts) is | isEnd is = mkClump (reverse (t:|ts)) : go [] is 157 go ts (i:is) = go (i:ts) is 158 159 isSpc S{x=WYTE} = True 160 isSpc S{x=SEMI} = True 161 isSpc S{x=TERM} = True 162 isSpc _ = False 163 164 isEnd ((isSpc->True) : _) = True 165 isEnd (S{x=RUNE} : []) = True 166 isEnd (S{x=RUNE} : (isSpc->True) : _) = True 167 isEnd _ = False 168 169 170 -- Layout Engine (Understand Structure Implied by Indentation) ----------------- 171 172 data Tree = LEAF !Clump | NODE !ByteString ![Tree] !(Maybe Tree) 173 174 data Frag a = WOLF ByteString a | LAMB a 175 176 data Item = I ByteString [Tree] (Maybe Tree) 177 178 fTree :: Frag Clump -> Tree 179 fTree (LAMB c) = LEAF c 180 fTree (WOLF b _) = NODE b [] Nothing 181 182 iTree :: Item -> Tree 183 iTree (I t x k) = NODE t (reverse x) k 184 185 close :: Int -> [(Int, Item)] -> [(Int, Item)] 186 close _ [] = [] 187 close p (i:is) | p >= fst i = (i:is) 188 close p (i:j:k) = let ij = merge (iTree <$> i) j in close p (ij:k) 189 close _ _ = error "indent too small. Bug in block splitter" 190 191 pushOnto :: [(Int, Item)] -> (Int, Frag Clump) -> [(Int, Item)] 192 pushOnto stack (fp,f) = 193 let stc = close fp stack 194 in case (f, stc) of 195 (LAMB _, []) -> error "impossible; this case is already handled" 196 (WOLF r _, is) -> ((fp, I r [] Nothing) : is) 197 (LAMB _, i:is) -> (merge (fp, fTree f) i : is) 198 199 merge :: (Int, Tree) -> (Int, Item) -> (Int, Item) 200 merge (rp,r) (ip,i) = 201 case (compare rp ip, i) of 202 (LT , _ ) -> error "impossible merge" 203 (_ , I t cs (Just k)) -> (ip, I t (k:cs) (Just r)) 204 (EQ , I t cs Nothing ) -> (ip, I t cs (Just r)) 205 (GT , I t cs Nothing ) -> (ip, I t (r:cs) Nothing) 206 207 layout :: [(Int, Frag Clump)] -> [Tree] 208 layout = \case 209 [] -> [] 210 (_, f@LAMB{}) : fs -> fTree f : layout fs 211 (p, WOLF r _) : fs -> [iTree $ snd $ foldl1 forceMerge pushed] 212 where pushed = pushAll (p, I r [] Nothing) fs 213 where 214 pushAll :: (Int, Item) -> [(Int, Frag Clump)] -> [(Int, Item)] 215 pushAll pf fs = foldl' pushOnto [pf] fs 216 217 forceMerge :: (Int, Item) -> (Int, Item) -> (Int, Item) 218 forceMerge a b = merge (iTree <$> a) b 219 220 221 -- Block Splitter -------------------------------------------------------------- 222 223 data LineCat = VOID | NOTE Int | OPEN Int Int | TEXT Int | SING Int 224 deriving (Eq, Ord, Show) 225 226 data BlockBuffer 227 = WOODS 228 | TEXTY { ls :: ![[Lexeme]], deep :: !Int } 229 | BLOCK { ls :: ![[Lexeme]], deep :: !Int, prev :: !Int } 230 231 data BlockState = BS FilePath Int BlockBuffer 232 233 lineCat :: [Lexeme] -> LineCat 234 lineCat = \case 235 [] -> VOID 236 L S{x=TERM{}} : _ -> VOID 237 L s@S{x=SEMI{}} : _ -> NOTE s.off 238 L s@S{x=(LINE _)} : _ -> TEXT s.off 239 L s@S{x=RUNE{}} : L S{x=TERM} : _ -> OPEN (s.end - 1) s.off 240 L s@S{x=RUNE{}} : L S{x=WYTE} : _ -> OPEN (s.end - 1) s.off 241 L s@S{x=RUNE{}} : L S{x=SEMI} : _ -> OPEN (s.end - 1) s.off 242 L(S{x=WYTE{}}) : ls -> lineCat ls 243 L s : _ -> SING s.off 244 245 blockDepth :: LineCat -> Int 246 blockDepth (TEXT d) = d 247 blockDepth (OPEN d _) = d 248 blockDepth VOID = 0 249 blockDepth (NOTE d) = d 250 blockDepth (SING d) = d 251 252 absoluteDepth :: LineCat -> Int 253 absoluteDepth (TEXT d) = d 254 absoluteDepth (OPEN _ d) = d 255 absoluteDepth VOID = 0 256 absoluteDepth (NOTE d) = d 257 absoluteDepth (SING d) = d 258 259 blockStep 260 :: BlockState 261 -> Maybe [Lexeme] 262 -> (BlockState, [[[Lexeme]]]) 263 blockStep (BS fn lno st) inp = case inp of 264 Nothing -> 265 (BS fn lno WOODS,) $ 266 case st of 267 WOODS -> [] 268 TEXTY{ls} -> [reverse ls] 269 BLOCK{ls} -> [reverse ls] 270 271 Just l -> 272 let 273 c = lineCat l 274 x s o = (BS fn (lno+1) s, o) 275 bd = blockDepth c 276 ad = absoluteDepth c 277 breakIt ls = (reverse ls :) <$> blockStep (BS fn lno WOODS) inp 278 in case (st, c) of 279 (WOODS, TEXT{}) -> x (TEXTY [l] bd) [] 280 (WOODS, OPEN{}) -> x (BLOCK [l] bd (ad+1)) [] 281 (WOODS, VOID) -> x WOODS [] 282 (WOODS, NOTE{}) -> x WOODS [] 283 (WOODS, SING{}) -> x WOODS [[l]] 284 285 (TEXTY{..}, TEXT d) | d==deep -> x TEXTY{ls=l:ls,..} [] 286 (TEXTY{ls}, _) -> breakIt ls 287 288 -- If we see a blank line, but the previous line was not 289 -- blank, then don't break the block yet. We will decide 290 -- once we see the next line. 291 -- 292 -- However, if the previous line was blank too (depth = -1), 293 -- then this is a double-blank line, so we break here. 294 (BLOCK{ls,deep,prev}, VOID) -> 295 if prev > 0 296 then x BLOCK{deep,prev=0,ls} [] 297 else breakIt ls 298 299 -- If we already forgave a blank line, but we see another 300 -- input that maximally unindented, then break before 301 -- this line. 302 (BLOCK{ls,prev=0}, _) | ad==0 -> 303 (reverse ls :) <$> blockStep (BS fn lno WOODS) inp 304 305 -- If we see an input that's indented enough to be included 306 -- in this block, then include it in the block and remember 307 -- it's depth. 308 -- 309 -- However, if it's *not* indented enough to be included in 310 -- the block, break before that line. 311 (BLOCK{ls,deep}, _) -> 312 if bd >= deep 313 then x BLOCK{deep,prev=ad+1,ls=(l:ls)} [] 314 else breakIt ls 315 316 317 -- Take Lines, Spit out Trees -------------------------------------------------- 318 319 spanFrag :: Span (Lexi a) -> Frag (Span (Lexi a)) 320 spanFrag s@S{x=RUNE} = WOLF (spanBar s) s 321 spanFrag s = LAMB s 322 323 spanBar :: Span a -> ByteString 324 spanBar s = take (s.end - s.off) $ drop s.off s.lin.byt 325 326 clumpFrag :: Clump -> (Int, Frag Clump) 327 clumpFrag c@(C cs) = 328 case cs.x of 329 _ :| (_:_) -> (cs.off, LAMB c) 330 e :| [] -> case spanFrag e of 331 WOLF rune _ -> (cs.off + length rune - 1, WOLF rune c) 332 LAMB _ -> (cs.off, LAMB c) 333 334 blockState :: FilePath -> BlockState 335 blockState fn = BS fn 1 WOODS 336 337 treeStep :: BlockState -> Maybe ByteString -> (BlockState, [(Int, [Tree])]) 338 treeStep st@(BS fil num _) mInp = (st2, treeOut) 339 where 340 (st2, out) = blockStep st (mInp <&> \byt -> lex (LN fil num byt)) 341 treeOut = out <&> \bls -> 342 ( (unsafeHead (unsafeHead bls)).t.lin.num 343 , layout $ fmap clumpFrag $ clump $ multiLine $ concat bls 344 )