From 62a484d317ed872c3e226b21120d4e8a41e9906e Mon Sep 17 00:00:00 2001
From: transistor <trans@jabberwocky.ca>
Date: Sun, 15 May 2022 20:44:36 -0700
Subject: [PATCH] Modified the parser to work on the entire input

Previously it was going line by line, but that makes it hard to
properly parse multiline comments, so I modified it to include line
terminators in the token stream.  I also added parsing of /* */
and | \n comment types.  There is still a problem with line numbers
in the post-parsing phases, but they seem correct in the parser/lexer
stage.  It's still not able to parse the syscall.s file from Computie,
but it's mostly just issues with named constants preceded by a
"#" or "-" character.  As for the encoding stage, it has a problem
with a move instruction that uses a label.
---
 src/bin/m68kas.rs          | 19 +++++---
 src/cpus/m68k/assembler.rs | 66 +++++++++++++++++--------
 src/parser.rs              | 99 +++++++++++++++++++++++++++++++-------
 3 files changed, 138 insertions(+), 46 deletions(-)

diff --git a/src/bin/m68kas.rs b/src/bin/m68kas.rs
index 8ee47e7..b6c3431 100644
--- a/src/bin/m68kas.rs
+++ b/src/bin/m68kas.rs
@@ -11,12 +11,17 @@ fn main() {
     let filename = env::args().nth(1).unwrap();
     let text = fs::read_to_string(filename).unwrap();
 
-    let words = assembler.assemble_words(&text).unwrap();
-
-    println!("Output:");
-    for word in words.iter() {
-        print!("{:04x} ", word);
-    }
-    println!("");
+    match assembler.assemble_words(&text) {
+        Ok(words) => {
+            println!("Output:");
+            for word in words.iter() {
+                print!("{:04x} ", word);
+            }
+            println!("");
+        },
+        Err(err) => {
+            println!("{}", err.msg);
+        },
+    };
 }
  
diff --git a/src/cpus/m68k/assembler.rs b/src/cpus/m68k/assembler.rs
index 555073c..9aa596f 100644
--- a/src/cpus/m68k/assembler.rs
+++ b/src/cpus/m68k/assembler.rs
@@ -31,6 +31,7 @@ pub enum Disallow {
     NoARegImmediateOrPC         = 0x0702,
     NoRegsPrePostOrImmediate    = 0x011B,
     NoImmediateOrPC             = 0x0700,
+    OnlyAReg                    = 0x07FD,
 }
 
 impl Disallow {
@@ -113,18 +114,8 @@ impl M68kAssembler {
     }
 
     fn parse(&mut self, text: &str) -> Result<Vec<(usize, AssemblyLine)>, Error> {
-        let mut output = vec![];
-        let iter = text.split_terminator("\n");
-
-        for (lineno, line_text) in iter.enumerate() {
-            let mut parser = AssemblyParser::new(lineno, line_text);
-            let parsed_line = parser.parse_line()?;
-            if let Some(line) = parsed_line {
-                output.push((lineno, line));
-            }
-        }
-
-        Ok(output)
+        let mut parser = AssemblyParser::new(text);
+        parser.parse()
     }
 
     fn apply_relocations(&mut self) -> Result<(), Error> {
@@ -212,12 +203,18 @@ impl M68kAssembler {
         let operation_size = get_size_from_mneumonic(mneumonic).ok_or_else(|| Error::new(&format!("error at line {}: expected a size specifier (b/w/l)", lineno)));
         match &mneumonic[..mneumonic.len() - 1] {
 
-            "addi" | "addai" => {
+            "addi" => {
                 self.convert_common_immediate_instruction(lineno, 0x0600, args, operation_size?, Disallow::NoARegImmediateOrPC)?;
             },
-            "add" | "adda" => {
+            "addai" => {
+                self.convert_common_immediate_instruction(lineno, 0x0600, args, operation_size?, Disallow::OnlyAReg)?;
+            },
+            "add" => {
                 self.convert_common_dreg_instruction(lineno, 0xD000, args, operation_size?, Disallow::None)?;
             },
+            "adda" => {
+                self.convert_common_areg_instruction(lineno, 0xD000, args, operation_size?, Disallow::None)?;
+            },
             "andi" => {
                 if !self.check_convert_flags_instruction(lineno, 0x23C, 0x27C, args)? {
                     self.convert_common_immediate_instruction(lineno, 0x0200, args, operation_size?, Disallow::NoARegImmediateOrPC)?;
@@ -291,12 +288,18 @@ impl M68kAssembler {
                 self.convert_common_shift_instruction(lineno, mneumonic, 0xE010, args, operation_size?)?;
             },
 
-            "subi" | "subai" => {
+            "subi" => {
                 self.convert_common_immediate_instruction(lineno, 0x0400, args, operation_size?, Disallow::NoARegImmediateOrPC)?;
             },
-            "sub" | "suba" => {
+            "subai" => {
+                self.convert_common_immediate_instruction(lineno, 0x0400, args, operation_size?, Disallow::OnlyAReg)?;
+            },
+            "sub" => {
                 self.convert_common_dreg_instruction(lineno, 0x9000, args, operation_size?, Disallow::None)?;
             },
+            "suba" => {
+                self.convert_common_areg_instruction(lineno, 0x9000, args, operation_size?, Disallow::None)?;
+            },
 
             // TODO complete remaining instructions
             _ => return Err(Error::new(&format!("unrecognized instruction at line {}: {:?}", lineno, mneumonic))),
@@ -315,8 +318,16 @@ impl M68kAssembler {
     }
 
     fn convert_common_dreg_instruction(&mut self, lineno: usize, opcode: u16, args: &[AssemblyOperand], operation_size: Size, disallow: Disallow) -> Result<(), Error> {
+        self.convert_common_reg_instruction(lineno, opcode, args, operation_size, disallow, Disallow::NoAReg)
+    }
+
+    fn convert_common_areg_instruction(&mut self, lineno: usize, opcode: u16, args: &[AssemblyOperand], operation_size: Size, disallow: Disallow) -> Result<(), Error> {
+        self.convert_common_reg_instruction(lineno, opcode, args, operation_size, disallow, Disallow::NoDReg)
+    }
+
+    fn convert_common_reg_instruction(&mut self, lineno: usize, opcode: u16, args: &[AssemblyOperand], operation_size: Size, disallow: Disallow, disallow_reg: Disallow) -> Result<(), Error> {
         expect_args(lineno, args, 2)?;
-        let (direction, reg, operand) = convert_reg_and_other(lineno, args, Disallow::NoAReg)?;
+        let (direction, reg, operand) = convert_reg_and_other(lineno, args, disallow_reg)?;
         let (effective_address, additional_words) = convert_target(lineno, operand, operation_size, disallow)?;
         self.output.push(opcode | encode_size(operation_size) | direction | (reg << 9) | effective_address);
         self.output.extend(additional_words);
@@ -419,12 +430,13 @@ fn convert_target(lineno: usize, operand: &AssemblyOperand, size: Size, disallow
                     if name.starts_with("a") {
                         let reg = expect_reg_num(lineno, name)?;
                         return Ok(((0b100 << 3) | reg, vec![]));
+                    } else if name == "sp" {
+                        return Ok((0b100111, vec![]));
                     }
                 }
             }
             Err(Error::new(&format!("error at line {}: pre-decrement operator can only be used with a single address register", lineno)))
         },
-        // TODO complete remaining types
         _ => Err(Error::new(&format!("not implemented: {:?}", operand))),
     }
 }
@@ -474,7 +486,19 @@ fn convert_indirect(lineno: usize, args: &[AssemblyOperand], disallow: Disallow)
                 Ok(((0b101 << 3) | reg, convert_immediate(lineno, *offset, Size::Word)?))
             }
         },
-        // TODO add the index register mode
+        &[AssemblyOperand::Immediate(offset), AssemblyOperand::Register(name), AssemblyOperand::Register(index)] => {
+            let index_reg = expect_reg_num(lineno, index)?;
+            let da_select = if index.starts_with("a") { 1 << 15 } else { 0 };
+            if name == "pc" {
+                disallow.check(lineno, Disallow::NoPCRelativeIndex)?;
+                Ok((0b111011, vec![da_select | (index_reg << 12) | ((*offset as u16) & 0xff)]))
+            } else {
+                disallow.check(lineno, Disallow::NoIndirectIndexReg)?;
+                let reg = expect_address_reg_num(lineno, name)?;
+                Ok(((0b110 << 3) | reg, vec![da_select | (index_reg << 12) | ((*offset as u16) & 0xff)]))
+            }
+        },
+        // TODO add the MC68020 address options
         _ => {
             Err(Error::new(&format!("error at line {}: expected valid indirect addressing mode, but found {:?}", lineno, args)))
         }
@@ -498,14 +522,14 @@ fn convert_reg_and_other<'a>(lineno: usize, args: &'a [AssemblyOperand], disallo
 fn convert_immediate(lineno: usize, value: usize, size: Size) -> Result<Vec<u16>, Error> {
     match size {
         Size::Byte => {
-            if value < u8::MAX as usize {
+            if value <= u8::MAX as usize {
                 Ok(vec![value as u16])
             } else {
                 Err(Error::new(&format!("error at line {}: immediate number is out of range; must be less than {}, but number is {:?}", lineno, u8::MAX, value)))
             }
         },
         Size::Word => {
-            if value < u16::MAX as usize {
+            if value <= u16::MAX as usize {
                 Ok(vec![value as u16])
             } else {
                 Err(Error::new(&format!("error at line {}: immediate number is out of range; must be less than {}, but number is {:?}", lineno, u16::MAX, value)))
diff --git a/src/parser.rs b/src/parser.rs
index 0588278..a7a454c 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -28,16 +28,32 @@ pub struct AssemblyParser<'input> {
 }
 
 impl<'input> AssemblyParser<'input> {
-    pub fn new(lineno: usize, input: &'input str) -> Self {
+    pub fn new(input: &'input str) -> Self {
         Self {
-            lexer: AssemblyLexer::new(lineno, input),
+            lexer: AssemblyLexer::new(input),
         }
     }
 
-    pub fn parse_line(&mut self) -> Result<Option<AssemblyLine>, Error> {
-        let token = match self.lexer.get_next() {
-            Some(token) => token,
-            None => return Ok(None),
+    pub fn parse(&mut self) -> Result<Vec<(usize, AssemblyLine)>, Error> {
+        let mut output = vec![];
+        loop {
+            let lineno = self.lexer.get_next_lineno();
+            if let Some(line) = self.parse_line()? {
+                output.push((lineno, line));
+            } else {
+                break;
+            }
+        }
+        Ok(output)
+    }
+
+    fn parse_line(&mut self) -> Result<Option<AssemblyLine>, Error> {
+        let token = loop {
+            match self.lexer.get_next() {
+                Some(token) if token == "\n" => { },
+                Some(token) => { break token; }
+                None => { return Ok(None); },
+            }
         };
 
         let result = match token.as_str() {
@@ -67,12 +83,21 @@ impl<'input> AssemblyParser<'input> {
 
     fn parse_list_of_words(&mut self) -> Result<Vec<String>, Error> {
         let mut list = vec![];
+
+        // If we're already at the end of the line, then it's an empty list, so return
+        let next = self.lexer.peek();
+        if next.is_none() || next.as_ref().unwrap() == "\n" {
+            return Ok(list);
+        }
+
         loop {
             list.push(self.lexer.expect_next()?);
+
             let next = self.lexer.peek();
-            if next.is_none() || next.unwrap() != "," {
+            if next.is_none() || next.as_ref().unwrap() != "," {
                 return Ok(list);
             }
+            self.lexer.expect_next()?;
         }
     }
 
@@ -81,7 +106,7 @@ impl<'input> AssemblyParser<'input> {
 
         // If we're already at the end of the line, then it's an empty list, so return
         let next = self.lexer.peek();
-        if next.is_none() {
+        if next.is_none() || next.as_ref().unwrap() == "\n" {
             return Ok(list);
         }
 
@@ -160,9 +185,9 @@ pub struct AssemblyLexer<'input> {
 }
 
 impl<'input> AssemblyLexer<'input> {
-    pub fn new(lineno: usize, input: &'input str) -> Self {
+    pub fn new(input: &'input str) -> Self {
         Self {
-            lineno,
+            lineno: 1,
             chars: input.chars().peekable(),
             peeked: None,
         }
@@ -172,6 +197,11 @@ impl<'input> AssemblyLexer<'input> {
         self.lineno
     }
 
+    pub fn get_next_lineno(&mut self) -> usize {
+        self.eat_whitespace();
+        self.lineno
+    }
+
     pub fn get_next(&mut self) -> Option<String> {
         if self.peeked.is_some() {
             let result = std::mem::replace(&mut self.peeked, None);
@@ -214,20 +244,53 @@ impl<'input> AssemblyLexer<'input> {
     }
 
     pub fn expect_end(&mut self) -> Result<(), Error> {
-        if let Some(token) = self.get_next() {
-            Err(Error::new(&format!("expected end of line at {}: found {:?}", self.lineno, token)))
-        } else {
+        let token = self.get_next();
+        if token.is_none() || token.as_ref().unwrap() == "\n" {
             Ok(())
+        } else {
+            Err(Error::new(&format!("expected end of line at {}: found {:?}", self.lineno, token)))
         }
     }
 
     fn eat_whitespace(&mut self) {
-        while self.chars.next_if(|ch| is_whitespace(*ch)).is_some() { }
-    }
-}
+        while let Some(ch) = self.chars.peek() {
+            if *ch == '|' {
+                self.read_until('\n')
+            } else if *ch == '/' {
+                self.chars.next();
+                if self.chars.next_if(|ch| *ch == '*').is_some() {
+                    loop {
+                        self.read_until('*');
+                        self.chars.next();
+                        if self.chars.next_if(|ch| *ch == '/').is_some() {
+                            break;
+                        }
+                    }
+                } else {
 
-fn is_whitespace(ch: char) -> bool {
-    ch == ' ' || ch == '\n' || ch == '\t'
+                }
+            } else if *ch == ' ' || *ch == '\t' || *ch == '\r' {
+                self.chars.next();
+            } else {
+                if *ch == '\n' {
+                    self.lineno += 1;
+                }
+                break;
+            }
+        }
+    }
+
+    fn read_until(&mut self, test: char) {
+        while let Some(ch) = self.chars.peek() {
+            if *ch == test {
+                return;
+            }
+            if *ch == '\n' {
+                self.lineno += 1;
+            }
+            self.chars.next();
+        }
+    }
 }
 
 fn is_word(ch: char) -> bool {