(ns robotwar.assembler
  (:use (clojure [string :only [split join]])))

(def op-commands    [ "-" "+" "*" "/" "=" "#" "<" ">" ])
(def word-commands  [ "TO" "IF" "GOTO" "GOSUB" "ENDSUB" ])

(def commands (concat op-commands word-commands))

(defn re-seq-with-pos
  "Returns a lazy sequence of successive matches of pattern in string with position.
  Largely stolen from re-seq source."
  [^java.util.regex.Pattern re s]
  (let [m (re-matcher re s)]
    ((fn step []
       (when (.find m)
         (cons [(re-groups m) (.start m)] (lazy-seq (step))))))))

(defn split-lines
  [raw-lines]
  (split raw-lines #"\n"))

(defn strip-comments
  [lines]
  (map #(re-find #"[^;]*" %) lines))

(def lex-re
  (let [op-string (join op-commands)]
    (re-pattern (str "[" op-string "]|[^" op-string "\\s]+"))))

(defn lex-line
  "Helper function for lex. Note: metadata fields :line and :pos
  are intended to be human-readable for error-reporting
  purposes, so they're indexed from 1."
  [line-num line]
  (map (fn [[s n]]
         ^{:line (inc line-num), :pos (inc n)} {:token-str s})
       (re-seq-with-pos lex-re line)))

(defn lex
  "Lexes a sequence of lines into a sequence of sequences of tokens
  (referred to in docstrings for parsing functions as lines of tokens)."
  [lines]
  (map-indexed lex-line lines))

(defn str->int
  "Integer/parseInt, but returns nil on failure"
  [s-raw]
  (try (Integer/parseInt s-raw)
       (catch Exception e nil)))

(defn valid-word
  "Capital letters and numbers, starting with a capital letter"
  [s]
  (re-matches #"^[A-Z][A-Z\d]*" s))

(def return-err (constantly "Invalid word or symbol"))

(defn parse-token
  "parses a token with a token-str field.
  needs to work with the original token map by using dissoc and into
  (rather than building a new one) because it contains line and column
  number metadata."
  [{token-str :token-str :as token}]
  (let [parser-priority
        [[(set commands)   :command]
         [str->int         :number]
         [valid-word       :identifier]
         [return-err       :error]]]
    (some
      (fn [[parser token-type]]
        (when-let [token-val (parser token-str)]
          (dissoc (into token {:val token-val, :type token-type})
                  :token-str)))
      parser-priority)))

(defn parse-line
  "takes a line of tokens and runs each token through parse-token for the first
  pass of determining its type. Then parse-line further divides :identifier
  tokens into two types: :label if it's the only thing on its line or it follows
  a 'GOTO' or a 'GOSUB', and :register otherwise.
  If we encounter an error, just return the token, not a sequence of tokens."
  [initial-tokens]
  (loop [[token & tail :as tokens] initial-tokens
         parsed-tokens []]
    (if (empty? tokens)
      parsed-tokens
      (let [{token-type :type token-val :val :as parsed-token} (parse-token token)]
        (case token-type
         :error parsed-token
         (:command :number) (recur tail (conj parsed-tokens parsed-token))
         :identifier (if (or (= (count initial-tokens) 1)
                             (#{"GOTO" "GOSUB"} (:val (last parsed-tokens))))
                       (recur tail (conj parsed-tokens (assoc parsed-token :type :label)))
                       (recur tail (conj parsed-tokens (assoc parsed-token :type :register)))))))))

(defn parse
  "take the lines of tokens and converts them to :val and :type format.
  After this point, tokens are no longer separated into sequences of sequences
  according to the linebreaks in the original source code --
  if we need that information later for error reporting, it's in the metadata.
  if there's an error, this function just returns the token,
  outside of any sequence."
  [initial-token-lines]
  (loop [[token-line & tail :as token-lines] initial-token-lines
         parsed-token-lines []]
    (if (empty? token-lines)
      parsed-token-lines
      (let [parsed-line (parse-line token-line)]
        (if (= (:type parsed-line) :error)
          parsed-line
          (recur tail (concat parsed-token-lines parsed-line)))))))

(defn disambiguate-minus-signs
  [initial-tokens]
  (loop [tokens initial-tokens
         results []]
    (let [{prev-type :type} (last results)
          [{current-val :val :as current-token}
           & [{next-val :val, next-type :type :as next-token} :as tail]] tokens]
      (cond
        (empty? tokens) results
        (and (= current-val "-")
             (= next-type :number)
             (not (#{:number :register} prev-type)))
        (recur (rest tail)
               (conj results (into current-token {:val (- next-val), :type :number})))
        :otherwise (recur tail (conj results current-token))))))

(defn make-instr-pairs
  "Compiles the tokens into token-pairs. Commands consume the next token.
  When values are encountered that are not arguments to commands,
  a special token-pair is created that is a comma followed by the value
  (meaning push the value into the accumulator). The comma command re-uses
  the same :line and :pos metadata from the token containing the value that is being pushed."
  [initial-tokens]
  (loop [[token & tail :as tokens] initial-tokens
         result []]
    (if (empty? tokens)
      result
      (let [{:keys [type val]} token]
        (cond
          (or (= type :number) (= type :register))
            (recur tail (conj result [(into token {:val ",", :type :command}) token]))
          (or (= type :label) (and (= type :command) (= val "ENDSUB")))
            (recur tail (conj result [token nil]))
          (= type :command)
            (recur (rest tail) (conj result [token (first tail)])))))))


; TODO: preserve :line and :pos metadata with labels,
; when labels are transferred from the instruction list to the label map

(defn map-labels
  "Maps label-names to their appropriate indexes in the instruction list,
  and remove the labels from the instruction list itself (except as targets)"
  [initial-instrs]
  (loop [[instr & tail :as instrs] initial-instrs
         result {:labels {}
                 :instrs []}]
    (if (empty? instrs)
      result
      (let [command (first instr)
            next-instr-num (count (result :instrs))]
        (if (= (command :type) :label)
          (recur tail (assoc-in result [:labels (command :val)] next-instr-num))
          (recur tail (assoc-in result [:instrs next-instr-num] instr)))))))

(defn assemble [src-code]
  "compiles robotwar code, with error-checking beginning after the lexing
  step. All functions that return errors will return a map with the keyword
  :error, and then a token with a :val field containing the error string,
  and metadata containing :pos and :line fields containing the location.
  So far only parse implements error-checking."
  (let [lexed (-> src-code split-lines strip-comments lex)]
    (reduce (fn [result step]
              (if (= (:type result) :error)
                result
                (step result)))
            lexed
            [parse disambiguate-minus-signs make-instr-pairs map-labels])))