robotwar/src/robotwar/foundry.clj

(ns robotwar.foundry
  (:require [robotwar.kernel-lexicon])
  (:use (clojure [string :only [split join]]
                 [pprint :only [pprint]])
        [clojure.core.match :only [match]]))

(defn re-seq-with-pos
  "Returns a lazy sequence of successive matches of pattern in string with position.
  Largely stolen from re-seq source."
  [^java.util.regex.Pattern re s]
  (let [m (re-matcher re s)]
    ((fn step []
       (when (.find m)
         (cons [(re-groups m) (.start m)] (lazy-seq (step))))))))

(defn split-lines
  [raw-lines]
  (split raw-lines #"\n"))

(defn strip-comments
  [lines]
  (map #(re-find #"[^;]*" %) lines))

(def lex-re
  (let [op-string (join robotwar.kernel-lexicon/op-commands)]
    (re-pattern (str "[" op-string "]|[^" op-string "\\s]+"))))

(defn lex-line
  "Helper function for lex. Note: metadata fields :line and :pos
  are intended to be human-readable for error-reporting
  purposes, so they're indexed from 1."
  [line-num line]
  (map (fn [[s n]]
         ^{:line (inc line-num), :pos (inc n)} {:token-str s})
       (re-seq-with-pos lex-re line)))

(defn lex
  "Lexes a sequence of lines. After this point, tokens
  are no longer grouped by line (line numbers have been
  captured in metadata, along with column numbers)."
  [lines]
  (apply concat (map-indexed lex-line lines)))

(defn str->int
  "Integer/parseInt, but returns nil on failure"
  [s-raw]
  (try (Integer/parseInt s-raw)
       (catch Exception e nil)))

(defn valid-word
   "Capital letters and numbers, starting with a capital letter"
  [s]
  (re-matches #"^[A-Z][A-Z\d]*" s))

(def return-err (constantly "Invalid word or symbol"))

(defn parse-token
  "takes a vector of reg-names and a token with a token-str field and parses the token.
  needs to work with the original token map by using dissoc and into
  (rather than building a new one) because it contains line and column
  number metadata."
  [reg-names {token-str :token-str :as token}]
  (let [parser-priority
        [[(set reg-names)  :register]
         [(set robotwar.kernel-lexicon/commands) :command]
         [str->int         :number]
         [valid-word       :label]
         [return-err       :error]]]
    (some
      (fn [[parser token-type]]
        (when-let [token-val (parser token-str)]
          (dissoc (into token {:val token-val, :type token-type})
                  :token-str)))
        parser-priority)))

(defn parse
  "take the tokens and convert them to structured source code ready for compiling.
  if there's an error, returns a different type: just the token,
  outside of any sequence."
  [reg-names initial-tokens]
  (loop [[token & tail :as tokens] initial-tokens
         parsed-tokens []]
    (if (empty? tokens)
      parsed-tokens
      (let [{token-type :type :as parsed-token} (parse-token reg-names token)]
        (if (= token-type :error)
          parsed-token
          (recur tail (conj parsed-tokens parsed-token)))))))

(defn disambiguate-minus-signs
  [initial-tokens]
  (loop [tokens initial-tokens
         results []]
    (let [{prev-type :type} (last results)
          [{current-val :val :as current-token}
           & [{next-val :val, next-type :type :as next-token} :as tail]] tokens]
      (cond
        (empty? tokens) results
        (and (= current-val "-")
             (= next-type :number)
             (not (#{:number :register} prev-type)))
          (recur (rest tail)
                 (conj results (into current-token {:val (- next-val), :type :number})))
        :otherwise (recur tail (conj results current-token))))))

(defn make-instr-pairs
  "Compiles the tokens into token-pairs. Commands consume the next token.
  When values are encountered that are not arguments to commands,
  a special token-pair is created that is a comma followed by the value
  (meaning push the value into the accumulator). The comma command re-uses
  the same :line and :pos metadata from the token containing the value that is being pushed."
  [initial-tokens]
  (loop [[token & tail :as tokens] initial-tokens
         result []]
    (if (empty? tokens)
      result
      (match [token]
        [{:type (:or :number :register)}]
          (recur tail (conj result [(into token {:val ",", :type :command}) token]))
        [(:or {:type :label} {:type :command, :val "ENDSUB"})]
          (recur tail (conj result [token nil]))
        [{:type :command}]
          (recur (rest tail) (conj result [token (first tail)]))))))

; TODO: preserve :line and :pos metadata with labels,
; when labels are transferred from the instruction list to the label map

(defn map-labels
  "Maps label-names to their appropriate indexes in the instruction list,
  and remove the labels from the instruction list itself (except as targets)"
  [initial-instrs]
  (loop [[instr & tail :as instrs] initial-instrs
         result {:labels {}
                 :instrs []}]
    (if (empty? instrs)
      result
      (let [command (first instr)
            next-instr-num (count (result :instrs))]
        (if (= (command :type) :label)
          (recur tail (assoc-in result [:labels (command :val)] next-instr-num))
          (recur tail (assoc-in result [:instrs next-instr-num] instr)))))))

(defn assemble [reg-names string]
  "compiles robotwar code, with error-checking beginning after the lexing
  step. All functions that return errors will return a map with the keyword
  :error, and then a token with a :val field containing the error string,
  and metadata containing :pos and :line fields containing the location.
  So far only parse implements error-checking."
  (let [parse-with-reg-names (partial parse reg-names)
        lexed (-> string split-lines strip-comments lex)]
   (reduce (fn [result step]
             (if (= (:type result) :error)
               result
               (step result)))
           lexed
           [parse-with-reg-names disambiguate-minus-signs make-instr-pairs map-labels])))