(*
 * Copyright (c) 2016, 2017, 2018, 2019
 *	The President and Fellows of Harvard College.
 * Written by David A. Holland.
 *
 * Copyright (c) 2020 David A. Holland.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *)

(*
 * Support code for ocamllex lexers. This is stuff that nearly every
 * lexer needs and was previously getting cutpasted everywhere. Now
 * there's at least a single file to cutpaste into every project...
 *)


open Pos


(*
 * Positioning and text/value extraction.
 *)

(* track the current position *)
let curfile = ref ""
let curline = ref 0
let curcol = ref 0

(* print the current position as a string for errors *)
let getpostxt () =
   let strings =
      [!curfile; string_of_int !curline; string_of_int !curcol]
   in
   String.concat ":" strings

(* assign the current pathname *)
let setpathname lexbuf pathname =
   let lp = { lexbuf.Lexing.lex_curr_p with Lexing.pos_fname = pathname; } in
   lexbuf.Lexing.lex_curr_p <- lp;
   lexbuf

(* accept a newline *)
let nl lexbuf =
   Lexing.new_line lexbuf;
   curline := !curline + 1;
   curcol := 1

(* advance over the current token *)
let advance lexbuf =
   let len = Lexing.lexeme_end lexbuf - Lexing.lexeme_start lexbuf in
   curcol := !curcol + len

(* extract the current position as a Pos.pos *)
let getpos () =
   let l = !curline in
   let c = !curcol in
   { file = !curfile; startline=l; startcolumn=c; endline=l; endcolumn=c; }

(* for use in lexer rules: consume the current token and return its position *)
let pos lexbuf =
   let ret = getpos () in
   advance lexbuf;
   ret

(* like pos but also return the token string ("value") *)
let posval lexbuf =
   let ret = getpos () in
   advance lexbuf;
   (ret, Lexing.lexeme lexbuf)

(* for use in lexer rules: consume the current token and return its value *)
let tval lexbuf =
   advance lexbuf;
   Lexing.lexeme lexbuf

(* like tval but also apply a function to the value (e.g. int_of_string) *)
let tval' lexbuf f =
   advance lexbuf;
   f (Lexing.lexeme lexbuf)


(*
 * String accumulation buffer.
 *
 * Because it doesn't work to write a single lexer rule for a string
 * constant, it must be read as multiple lexer tokens, and they need
 * to be stored somewhere.
 *
 * stringdata holds the string being built;
 * stringstart records the position where the string started;
 * startstring starts building a new string;
 * addstring adds a single ocaml string to it;
 * addchar adds a single character;
 * getstring returns the string.
 *
 * XXX: getstring should probably also stuff stringstart into the
 * lexbuf so the parser reads the right positions out; not sure what
 * the current state is but it _looks_ like it doesn't work right and
 * the position attached to the output string constant will be the
 * position of just the close-quote.
 *)

let stringdata = ref (Buffer.create 64)
let stringstart = ref Pos.nowhere
let startstring lexbuf =
   Buffer.clear !stringdata;
   stringstart := pos lexbuf
let addstring s =
   Buffer.add_string !stringdata s
let addchar c =
   addstring (String.make 1 c)
let getstring () =
   Buffer.contents !stringdata


(*
 * identifiers and keywords
 *
 * doident' takes a mapping from keyword strings to tokens and a
 * function to construct an ordinary identifier token, and an
 * identifier string, and returns a token.
 *
 * intended usage is:
 *    let keywords = Types.stringmap_of_list [ ("kw", KW) ... ]
 *    let doident = doident' keywords (fun x -> IDENT x)
 *
 * and then in lexer rules,
 *    doident (tval lexbuf)
 *
 * This takes into account the fact that every lexer's output token
 * type is different (since it's defined by the parser) and also has a
 * different list of reserved words and tokens, but factors out as
 * much of the logic as is reasonably possible. (Note that while one
 * might in principle be able to get a tidier factoring by making
 * lextools into a functor over the parser module, I don't think it
 * actually works in ocaml. To be able to mention IDENT in lextools
 * you'd need to be able to give a partial definition of a sum type
 * belonging to the functor argument, and I'm pretty sure that isn't
 * possible.)
 *)

let doident' keywords mkident tval =
   try
      (Types.StringMap.find tval keywords)
   with Not_found ->
      mkident tval

(*
 * For invalid input
 *
 * The types for badchar pretend to return a token, so that it could
 * be changed if desired to continue on error (producing an INVALID or
 * placeholder token or the like) without changing the lexer rules.
 * (Of course, arguably it makes more sense in that case for it to
 * record failure and produce _no_ token rather than adding an INVALID
 * token to the parser... but since error printing and recovery in
 * ocamlyacc is pretty much a loss anyhow it's not worth thinking
 * about.)
 *)

(* report and choke on an invalid input character *)
let badchar (pos, tval) =
   let postxt = Pos.string_of_pos pos in
   Util.crash (postxt ^ ": Invalid input character " ^ tval)

(* report and choke on an unterminated string constant *)
let badstring pos =
   let postxt = Pos.string_of_pos pos in
   let postxt2 = Pos.string_of_pos !stringstart in
   Util.say (postxt ^ ": Unterminated string constant");
   Util.crash (postxt2 ^ ": String constant began here")

(*
 * Include file support
 *
 * Mucking with include files is messy (it is messy in every lex-style
 * lexer tool) and everything would be a lot cleaner without it,
 * except that it's commonly enough wanted or needed that it's worth
 * having the support here so it only needs to be implemented and
 * debugged once.
 *
 * The code here maintains an include path: a list of directories to
 * look for include files in. If you need more elaborate include path
 * or search logic you'll have to customize. So far, abstracting that
 * out looks horribly messy.
 *)

(* list of directories to include from *)
let includepath = ref []

(* list of files currently in progress *)
let includestack = ref []

(* add an entry to the include path (in order) *)
let addincludepath path =
   includepath := !includepath @ [path]

(* find and open an include file, or crash if not found *)
let findinclude name =
   let tryopen pathname =
      try
         Some (pathname, open_in pathname)
      with Sys_error _ -> None
   in
   let once z dir =
      match z with
      | Some _ -> z
      | None -> tryopen (Filename.concat dir name)
   in
   match List.fold_left once None !includepath with
   | Some z -> z
   | None ->
        let herepath = Filename.concat (Filename.dirname !curfile) name in
        match tryopen herepath with
        | Some z -> z
        | None ->
             let msg = "Cannot find include file " ^ name in
             Util.crash (getpostxt () ^ ": " ^ msg)

(* include a file; bypath chooses whether to use the search path *)
let includefile' bypath name =
   let (pathname, channel) =
      if bypath then findinclude name
      else (name, open_in name)
   in
   let lexbuf = setpathname (Lexing.from_channel channel) pathname in
   includestack := (lexbuf, !curfile, !curline, !curcol) :: !includestack;
   curfile := pathname;
   curline := 1;
   curcol := 1

(* include a file from the search path *)
let includefile name = includefile' true name

(* internal input widget input; eof is the EOF token; iseof a test for it *)
let includewrap eof iseof lexer fakelexbuf =
   (* transfer the position info into the lexbuf the parser sees *)
   let lbupdate reallexbuf =
      fakelexbuf.Lexing.lex_eof_reached <- reallexbuf.Lexing.lex_eof_reached;
      fakelexbuf.Lexing.lex_start_p <- reallexbuf.Lexing.lex_start_p;
      fakelexbuf.Lexing.lex_curr_p <- reallexbuf.Lexing.lex_curr_p
   in
   (* try to read from the top of the include stack, pop on eof *)
   let rec tryread () =
      match !includestack with
      | [] -> eof
      | (reallexbuf, prevfile, prevline, prevcol) :: more ->
           let tok = lexer reallexbuf in
           lbupdate reallexbuf;
           if iseof tok then begin
              begin match more with
              | [] ->
                   (*
                    * Reached the end of the original file; leave
                    * the last position in place, so if it causes a
                    * parse error we report it at the end of the
                    * outermost file instead of at the null initial
                    * position.
                    *)
                   ()
              | _ ->
                   curfile := prevfile;
                   curline := prevline;
                   curcol := prevcol
              end;
              (* pop the stack and try again *)
              includestack := more;
              tryread ()
           end
           else tok
   in
   tryread ()

(*
 * Dump support.
 *
 * Sometimes for debugging it's extremely helpful to see the token
 * stream the lexer produces. This arranges that. To use it,
 * uncomment the dump line in read'. Unfortunately this consumes
 * the token stream as well as printing it; haven't yet sorted out
 * a good way to print and also feed tokens to the parser. Probably
 * means moving the dump logic to includewrap -- without include
 * handling (when this was first set up), that hook didn't exist.
 *
 * iseof tests for a token being EOF; dumpone prints a single token.
 *)

let rec dump' iseof dumpone lexer lexbuf =
   let t = lexer lexbuf in
   if iseof t then ()
   else begin
      let txt = dumpone t in
      print_string txt;
      print_newline ();
      dump' iseof dumpone lexer lexbuf
   end

(*
 * Top level.
 *
 * base is the lexer function (the name of the toplevel lexer rule in
 * the .mll file)
 *
 * mkparser is the parser function (the name of the start token in the
 * .mly file)
 *
 * eof is the EOF token for this parser;
 * iseof tests for a token being EOF;
 *
 * dumpone is a token -> string function for debug printing;
 *
 * check is a first-pass checker for the parser output if desired
 * (otherwise use (fun x -> x))
 *
 * pathname is the file to read.
 *
 * Intended usage is:
 *    let read = read' base Parser.file EOF iseof dumpone check
 *
 * This function will either return parse results or print an error
 * and crash out.
 *
 * Note: it is likely unnecessary to have iseof (vs. just checking
 * tokens with = against eof) but it remains not entirely clear to me
 * whether ocaml's builtin = remains safe on nullary constructors for
 * types in which some constructors contain values that the builtin =
 * does not work properly with. Since we might reasonably have such
 * values in tokens (e.g. Big_int.big_int), seems best to be cautious.
 *)

let read' base mkparser eof iseof dumpone check pathname =
   let dump lexer lexbuf = dump' iseof dumpone lexer lexbuf in
   let _ = dump in (* silence unused warning *)

   (* push the file on the include stack *)
   includefile' false pathname;

   (* lexbuf for the parser; the contents should never appear *)
   let fakelexbuf = Lexing.from_string "xyzzy fnord" in

   (* construct the parser *)
   let lexer = base in
   let parser = mkparser (includewrap eof iseof lexer) in
   try
      (* uncomment to dump the token stream as described above *)
      (*dump lexer lexbuf;*)

      (* run the parser *)
      let decls = parser fakelexbuf in
      (* check the output *)
      check decls
   with Parsing.Parse_error ->
      (* If the parser chokes, print the position *)
      let msg = getpostxt () ^ ": Parse error" in
      Util.crash msg
