contiki/apps/webbrowser/htmlparser.c

/*
 * Copyright (c) 2002, Adam Dunkels.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials provided
 *    with the distribution.
 * 3. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * This file is part of the Contiki desktop environment
 *
 * $Id: htmlparser.c,v 1.8 2007/11/30 21:53:50 oliverschmidt Exp $
 *
 */

/* htmlparser.c:
 *
 * Implements a very simplistic HTML parser. It recognizes HTML links
 * (<a href>-tags), HTML img alt tags, a few text flow break tags
G * (<br>, <p>, <h>), the <li> tag (but does not even try to
 * distinguish between <ol> or <ul>) as well as HTML comment tags
 * (<!-- -->).
 *
 * To save memory, the HTML parser is state machine driver, which
 * means that it will shave off one character from the HTML page,
 * process that character, and return to the next. Another way of
 * doing it would be to buffer a number of characters and process them
 * together.
 *
 * The main function in this file is the htmlparser_parse() function
 * which takes a htmlparser_state structur and a part of an HTML file
 * as an argument. The htmlparser_parse() function will call the
 * helper functions parse_char() and parse_tag(). Those functions will
 * in turn call the two callback functions htmlparser_char() and
 * htmlparser_tag(). Those functions must be implemented by the using
 * module (e.g., a web browser program).
 *
 * htmlparser_char() will be called for every non-tag character.
 *
 * htmlparser_tag() will be called whenever a full tag has been found.
 *
 */

#include <string.h>

#include "contiki.h"
#include "html-strings.h"
#include "www.h"

#include "htmlparser.h"

#if 1
#define PRINTF(x)
#else
#include <stdio.h>
#define PRINTF(x) printf x
#endif


/*-----------------------------------------------------------------------------------*/
#define ISO_A     0x41
#define ISO_B     0x42
#define ISO_E     0x45
#define ISO_F     0x46
#define ISO_G     0x47
#define ISO_H     0x48
#define ISO_I     0x49
#define ISO_L     0x4c
#define ISO_M     0x4d
#define ISO_P     0x50
#define ISO_R     0x52
#define ISO_T     0x54

#define ISO_a     (ISO_A | 0x20)
#define ISO_b     (ISO_B | 0x20)
#define ISO_e     (ISO_E | 0x20)
#define ISO_f     (ISO_F | 0x20)
#define ISO_g     (ISO_G | 0x20)
#define ISO_h     (ISO_H | 0x20)
#define ISO_i     (ISO_I | 0x20)
#define ISO_l     (ISO_L | 0x20)
#define ISO_m     (ISO_M | 0x20)
#define ISO_p     (ISO_P | 0x20)
#define ISO_r     (ISO_R | 0x20)
#define ISO_t     (ISO_T | 0x20)

#define ISO_ht    0x09
#define ISO_nl    0x0a
#define ISO_cr    0x0d
#define ISO_space 0x20
#define ISO_bang  0x21
#define ISO_citation 0x22
#define ISO_ampersand 0x26
#define ISO_citation2 0x27
#define ISO_asterisk 0x2a
#define ISO_dash  0x2d
#define ISO_slash 0x2f
#define ISO_semicolon  0x3b
#define ISO_lt    0x3c
#define ISO_eq    0x3d
#define ISO_gt    0x3e

#define ISO_rbrack 0x5b
#define ISO_lbrack 0x5d

#define MINORSTATE_NONE           0
#define MINORSTATE_TEXT           1 /* Parse normal text */
#define MINORSTATE_EXTCHAR        2 /* Check for semi-colon */
#define MINORSTATE_TAG            3 /* Check for name of tag. */
#define MINORSTATE_TAGEND         4 /* Scan for end of tag. */
#define MINORSTATE_TAGATTR        5 /* Parse tag attr. */
#define MINORSTATE_TAGATTRSPACE   6 /* Parse optional space after tag
				       attr. */
#define MINORSTATE_TAGATTRPARAM   7 /* Parse tag attr parameter. */
#define MINORSTATE_TAGATTRPARAMNQ 8 /* Parse tag attr parameter without
				  quotation marks. */
#define MINORSTATE_HTMLCOMMENT    9 /* Scan for HTML comment end */

#define MAJORSTATE_NONE       0
#define MAJORSTATE_BODY       1
#define MAJORSTATE_LINK       2
#define MAJORSTATE_FORM       3
#define MAJORSTATE_DISCARD    4


struct htmlparser_state {

  unsigned char minorstate;
  char tag[20];
  unsigned char tagptr;
  char tagattr[20];
  unsigned char tagattrptr;
  char tagattrparam[WWW_CONF_MAX_URLLEN];
  unsigned char tagattrparamptr;
  unsigned char lastchar, quotechar;
  unsigned char majorstate, lastmajorstate;
  char linkurl[WWW_CONF_MAX_URLLEN];

  char word[WWW_CONF_WEBPAGE_WIDTH];
  unsigned char wordlen;

#if WWW_CONF_FORMS
  char formaction[WWW_CONF_MAX_FORMACTIONLEN];
  char formname[WWW_CONF_MAX_FORMNAMELEN];
  unsigned char inputtype;
  char inputname[WWW_CONF_MAX_INPUTNAMELEN];
  char inputvalue[WWW_CONF_MAX_INPUTVALUELEN];
  unsigned char inputvaluesize;
#endif /* WWW_CONF_FORMS */
};

static struct htmlparser_state s;

/*-----------------------------------------------------------------------------------*/
static char last[1] = {(char)0xff};

static const char *tags[] = {
#define TAG_FIRST       0
#define TAG_SLASHA      0
  html_slasha,
#define TAG_SLASHCENTER 1
  html_slashcenter,
#define TAG_SLASHFORM   2
  html_slashform,
#define TAG_SLASHH      3
  html_slashh,
#define TAG_SLASHSCRIPT 4
  html_slashscript,
#define TAG_SLASHSELECT 5
  html_slashselect,
#define TAG_SLASHSTYLE  6
  html_slashstyle,
#define TAG_A           7
  html_a,
#define TAG_BODY        8
  html_body,
#define TAG_BR          9
  html_br,
#define TAG_CENTER     10
  html_center,
#define TAG_FORM       11
  html_form,
#define TAG_FRAME      12
  html_frame,
#define TAG_H1         13
  html_h1,
#define TAG_H2         14
  html_h2,
#define TAG_H3         15
  html_h3,
#define TAG_H4         16
  html_h4,
#define TAG_IMG        17
  html_img,
#define TAG_INPUT      18
  html_input,
#define TAG_LI         19
  html_li,
#define TAG_P          20
  html_p,
#define TAG_SCRIPT     21
  html_script,
#define TAG_SELECT     22
  html_select,
#define TAG_STYLE      23
  html_style,
#define TAG_TR         24
  html_tr,
#define TAG_LAST       25
  last,
};

/*-----------------------------------------------------------------------------------*/
static unsigned char CC_FASTCALL
iswhitespace(char c)
{
  return (c == ISO_space ||
	  c == ISO_nl ||
	  c == ISO_cr ||
	  c == ISO_ht);
}
/*-----------------------------------------------------------------------------------*/
void
htmlparser_init(void)
{
  s.majorstate = s.lastmajorstate = MAJORSTATE_DISCARD;
  s.minorstate = MINORSTATE_TEXT;
  s.lastchar = 0;
}
/*-----------------------------------------------------------------------------------*/
static char CC_FASTCALL
lowercase(char c)
{
  /* XXX: This is a *brute force* approach to lower-case
     converting and should *not* be used anywhere else! It
     works for our purposes, however (i.e., HTML tags). */
  if(c > 0x40) {
    return (c & 0x1f) | 0x60;
  } else {
    return c;
  }
}
/*-----------------------------------------------------------------------------------*/
static void
endtagfound(void)
{
  s.tag[s.tagptr] = 0;
  s.tagattr[s.tagattrptr] = 0;
  s.tagattrparam[s.tagattrparamptr] = 0;
}
/*-----------------------------------------------------------------------------------*/
static void CC_FASTCALL
switch_majorstate(unsigned char newstate)
{
  if(s.majorstate != newstate) {
    PRINTF(("Switching state from %d to %d (%d)\n", s.majorstate, newstate, s.lastmajorstate));
    s.lastmajorstate = s.majorstate;
    s.majorstate = newstate;
  }
}
/*-----------------------------------------------------------------------------------*/
static void CC_FASTCALL
add_char(unsigned char c)
{
  if(s.wordlen < WWW_CONF_WEBPAGE_WIDTH - 1 && c < 0x80) {
    s.word[s.wordlen] = c;
    ++s.wordlen;
  }
}
/*-----------------------------------------------------------------------------------*/
static void
do_word(void)
{
  if(s.wordlen > 0) {
    if(s.majorstate == MAJORSTATE_LINK) {
      if(s.word[s.wordlen] != ISO_space) {
	add_char(ISO_space);
      }
    } else if(s.majorstate == MAJORSTATE_DISCARD) {
      s.wordlen = 0;
    } else {
      s.word[s.wordlen] = '\0';
      htmlparser_word(s.word, s.wordlen);
      s.wordlen = 0;
    }
  }
}
/*-----------------------------------------------------------------------------------*/
static void
newline(void)
{
  do_word();
  htmlparser_newline();
}
/*-----------------------------------------------------------------------------------*/
static unsigned char CC_FASTCALL
find_tag(char *tag)
{
  static unsigned char first, last, i, tabi;
  static char tagc;

  first = TAG_FIRST;
  last = TAG_LAST;
  i = 0;

  do {
    tagc = tag[i];

    if(tagc == 0 &&
       tags[first][i] == 0) {
      return first;
    }

    tabi = first;

    /* First, find first matching tag from table. */
    while(tagc > (tags[tabi])[i] &&
	  tabi < last) {
      ++tabi;
    }
    first = tabi;

    /* Second, find last matching tag from table. */
    while(tagc == (tags[tabi])[i] &&
	  tabi < last) {
      ++tabi;
    }
    last = tabi;

    /* If first and last matching tags are equal, we have a non-match
       and return. Else we continue with the next character. */
    ++i;

  } while(last != first);
  return TAG_LAST;
}
/*-----------------------------------------------------------------------------------*/
static void
parse_tag(void)
{
  static char *tagattrparam;
  static unsigned char size;

  static char dummy;

  PRINTF(("Parsing tag '%s' '%s' '%s'\n",
	  s.tag, s.tagattr, s.tagattrparam));

  switch(find_tag(s.tag)) {
  case TAG_P:
  case TAG_H1:
  case TAG_H2:
  case TAG_H3:
  case TAG_H4:
    /*    parse_char(ISO_nl);*/
    newline();
    /* FALLTHROUGH */
  case TAG_BR:
  case TAG_TR:
  case TAG_SLASHH:
    /*    parse_char(ISO_nl);*/
    dummy = 0;
    newline();
    break;
  case TAG_LI:
    newline();
    add_char(ISO_asterisk);
    add_char(ISO_space);
    break;
  case TAG_SCRIPT:
  case TAG_STYLE:
  case TAG_SELECT:
    switch_majorstate(MAJORSTATE_DISCARD);
    break;
  case TAG_SLASHSCRIPT:
  case TAG_SLASHSTYLE:
  case TAG_SLASHSELECT:
    do_word();
    switch_majorstate(s.lastmajorstate);
    break;
  case TAG_BODY:
    s.majorstate = s.lastmajorstate = MAJORSTATE_BODY;
    break;
  case TAG_FRAME:
    if(strncmp(s.tagattr, html_src, sizeof(html_src)) == 0 &&
       s.tagattrparam[0] != 0) {
      switch_majorstate(MAJORSTATE_BODY);
      newline();
      add_char(ISO_rbrack);
      do_word();
      htmlparser_link((char *)html_frame, (unsigned char)strlen(html_frame), s.tagattrparam);
      PRINTF(("Frame [%s]\n", s.tagattrparam));
      add_char(ISO_lbrack);
      newline();
    }
    break;
  case TAG_IMG:
    if(strncmp(s.tagattr, html_alt, sizeof(html_alt)) == 0 &&
       s.tagattrparam[0] != 0) {
      /*      parse_char(ISO_lt);*/
      add_char(ISO_lt);
      tagattrparam = &s.tagattrparam[0];
      while(*tagattrparam) {
	/*	parse_char(*tagattrparam);*/
	add_char(*tagattrparam);
	++tagattrparam;
      }
      /*      parse_char(ISO_gt);*/
      add_char(ISO_gt);
      do_word();
    }
    break;
  case TAG_A:
    PRINTF(("A %s %s\n", s.tagattr, s.tagattrparam));
    if(strncmp(s.tagattr, html_href, sizeof(html_href)) == 0 &&
       s.tagattrparam[0] != 0) {
      strcpy(s.linkurl, s.tagattrparam);
      do_word();
      switch_majorstate(MAJORSTATE_LINK);
    }
    break;
  case TAG_SLASHA:
    if(s.majorstate == MAJORSTATE_LINK) {
      switch_majorstate(s.lastmajorstate);
      s.word[s.wordlen] = 0;
      htmlparser_link(s.word, s.wordlen, s.linkurl);
      s.wordlen = 0;
    }
    break;
#if WWW_CONF_FORMS
  case TAG_FORM:
    PRINTF(("Form tag\n"));
    switch_majorstate(MAJORSTATE_FORM);
    if(strncmp(s.tagattr, html_action, sizeof(html_action)) == 0) {
      PRINTF(("Form action '%s'\n", s.tagattrparam));
      strncpy(s.formaction, s.tagattrparam, WWW_CONF_MAX_FORMACTIONLEN - 1);
    } else if(strncmp(s.tagattr, html_name, sizeof(html_name)) == 0) {
      PRINTF(("Form name '%s'\n", s.tagattrparam));
      strncpy(s.formname, s.tagattrparam, WWW_CONF_MAX_FORMNAMELEN - 1);
    }
    s.inputname[0] = s.inputvalue[0] = 0;
    break;
  case TAG_SLASHFORM:
    switch_majorstate(MAJORSTATE_BODY);
    s.formaction[0] = s.formname[0] = 0;
    break;
  case TAG_INPUT:
    if(s.majorstate == MAJORSTATE_FORM) {
      /* First check if we are called at the end of an input tag. If
	 so, we should render the input widget. */
      if(s.tagattr[0] == 0 &&
	 s.inputname[0] != 0) {
	PRINTF(("Render input type %d\n", s.inputtype));
	switch(s.inputtype) {
	case HTMLPARSER_INPUTTYPE_NONE:
	case HTMLPARSER_INPUTTYPE_TEXT:
	  s.inputvalue[s.inputvaluesize] = 0;
	  htmlparser_inputfield(s.inputvaluesize, s.inputvalue, s.inputname,
				s.formname, s.formaction);
	  break;
	case HTMLPARSER_INPUTTYPE_SUBMIT:
	case HTMLPARSER_INPUTTYPE_IMAGE:
	  htmlparser_submitbutton(s.inputvalue, s.inputname,
				  s.formname, s.formaction);
	  break;
	}
	s.inputtype = HTMLPARSER_INPUTTYPE_NONE;
      } else {
	PRINTF(("Input '%s' '%s'\n", s.tagattr, s.tagattrparam));
	if(strncmp(s.tagattr, html_type, sizeof(html_type)) == 0) {
	  if(strncmp(s.tagattrparam, html_submit,
		     sizeof(html_submit)) == 0) {
	    s.inputtype = HTMLPARSER_INPUTTYPE_SUBMIT;
	  } else if(strncmp(s.tagattrparam, html_image,
			    sizeof(html_image)) == 0) {
	    s.inputtype = HTMLPARSER_INPUTTYPE_IMAGE;
	  } else if(strncmp(s.tagattrparam, html_text,
			    sizeof(html_text)) == 0) {
	    s.inputtype = HTMLPARSER_INPUTTYPE_TEXT;
	  } else {
	    s.inputtype = HTMLPARSER_INPUTTYPE_OTHER;
	  }
	} else if(strncmp(s.tagattr, html_name,
			  sizeof(html_name)) == 0) {
	  strncpy(s.inputname, s.tagattrparam,
		  WWW_CONF_MAX_INPUTNAMELEN);
	} else if(strncmp(s.tagattr, html_alt,
			  sizeof(html_alt)) == 0 &&
		  s.inputtype == HTMLPARSER_INPUTTYPE_IMAGE) {
	  strncpy(s.inputvalue, s.tagattrparam,
		  WWW_CONF_MAX_INPUTVALUELEN);
	} else if(strncmp(s.tagattr, html_value,
			  sizeof(html_value)) == 0) {
	  strncpy(s.inputvalue, s.tagattrparam,
		  WWW_CONF_MAX_INPUTVALUELEN);
	} else if(strncmp(s.tagattr, html_size,
			  sizeof(html_size)) == 0) {
	  size = 0;
	  if(s.tagattrparam[0] >= '0' &&
	     s.tagattrparam[0] <= '9') {
	    size = s.tagattrparam[0] - '0';
	    if(s.tagattrparam[1] >= '0' &&
	       s.tagattrparam[1] <= '9') {
	      size = size * 10 + (s.tagattrparam[1] - '0');
	    }
	  }
	  if(size >= WWW_CONF_MAX_INPUTVALUELEN) {
	    size = WWW_CONF_MAX_INPUTVALUELEN - 1;
	  }
	  s.inputvaluesize = size;
	  /*	  strncpy(s.inputvalue, s.tagattrparam,
		  WWW_CONF_MAX_INPUTVALUELEN);*/
	}
      }

    }
    break;
#endif /* WWW_CONF_FORMS */
#if WWW_CONF_RENDERSTATE
  case TAG_CENTER:
    /*    parse_char(ISO_nl);    */
    newline();
    htmlparser_renderstate(HTMLPARSER_RENDERSTATE_BEGIN |
			   HTMLPARSER_RENDERSTATE_CENTER);
    break;
  case TAG_SLASHCENTER:
    /*    parse_char(ISO_nl);*/
    newline();
    htmlparser_renderstate(HTMLPARSER_RENDERSTATE_END |
			   HTMLPARSER_RENDERSTATE_CENTER);
    break;
#endif /* WWW_CONF_RENDERSTATE */
  }
}
/*-----------------------------------------------------------------------------------*/
static uint16_t
parse_word(char *data, uint8_t dlen)
{
  static uint8_t i;
  static uint8_t len;
  unsigned char c;

  len = dlen;

  switch(s.minorstate) {
  case MINORSTATE_TEXT:
    for(i = 0; i < len; ++i) {
      c = data[i];
      if(iswhitespace(c)) {
	do_word();
      } else if(c == ISO_lt) {
	s.minorstate = MINORSTATE_TAG;
	s.tagptr = 0;
	/*	do_word();*/
	break;
      } else if(c == ISO_ampersand) {
	s.minorstate = MINORSTATE_EXTCHAR;
	break;
      } else {
	add_char(c);
      }
    }
    break;
  case MINORSTATE_EXTCHAR:
    for(i = 0; i < len; ++i) {
      c = data[i];
      if(c == ISO_semicolon) {
	s.minorstate = MINORSTATE_TEXT;
	add_char(' ');
	break;
      } else if(iswhitespace(c)) {
	s.minorstate = MINORSTATE_TEXT;
	add_char('&');
	add_char(' ');
	break;
      }
    }
    break;
  case MINORSTATE_TAG:
    /* We are currently parsing within the name of a tag. We check
       for the end of a tag (the '>' character) or whitespace (which
       indicates that we should parse a tag attr argument
       instead). */
    for(i = 0; i < len; ++i) {
      c = data[i];
      if(c == ISO_gt) {
	/* Full tag found. We continue parsing regular text. */
	s.minorstate = MINORSTATE_TEXT;
	s.tagattrptr = s.tagattrparamptr = 0;
	endtagfound();
	parse_tag();
	break;
      } else if(iswhitespace(c)) {
	/* The name of the tag found. We continue parsing the tag
	   attr.*/
	s.minorstate = MINORSTATE_TAGATTR;
	s.tagattrptr = 0;
	endtagfound();
	break;
      } else {
	/* Keep track of the name of the tag, but convert it to
	   lower case. */

	s.tag[s.tagptr] = lowercase(c);
	++s.tagptr;
	/* Check if the ->tag field is full. If so, we just eat up
	   any data left in the tag. */
	if(s.tagptr == sizeof(s.tag)) {
	  s.minorstate = MINORSTATE_TAGEND;
	  break;
	}
      }

      /* Check for HTML comment, indicated by <!-- */
      if(s.tagptr == 3 &&
	 s.tag[0] == ISO_bang &&
	 s.tag[1] == ISO_dash &&
	 s.tag[2] == ISO_dash) {
	PRINTF(("Starting comment...\n"));
	s.minorstate = MINORSTATE_HTMLCOMMENT;
	s.tagptr = 0;
	endtagfound();
	break;
      }
    }
    break;
  case MINORSTATE_TAGATTR:
    /* We parse the "tag attr", i.e., the "href" in <a
       href="...">. */
    for(i = 0; i < len; ++i) {
      c = data[i];
      if(c == ISO_gt) {
	/* Full tag found. */
	s.minorstate = MINORSTATE_TEXT;
	s.tagattrparamptr = 0;
	s.tagattrptr = 0;
	endtagfound();
	parse_tag();
	s.tagptr = 0;
	endtagfound();
	break;
      } else if(iswhitespace(c)) {
	if(s.tagattrptr == 0) {
	  /* Discard leading spaces. */
	} else {
	  /* A non-leading space is the end of the attribute. */
	  s.tagattrparamptr = 0;
	  endtagfound();
	  parse_tag();
	  s.minorstate = MINORSTATE_TAGATTRSPACE;
	  break;
	  /*	    s.tagattrptr = 0;
		    endtagfound();*/
	}
      } else if(c == ISO_eq) {
	s.minorstate = MINORSTATE_TAGATTRPARAMNQ;
	s.tagattrparamptr = 0;
	endtagfound();
	break;
      } else {
	s.tagattr[s.tagattrptr] = lowercase(c);
	++s.tagattrptr;
	/* Check if the "tagattr" field is full. If so, we just eat
	   up any data left in the tag. */
	if(s.tagattrptr == sizeof(s.tagattr)) {
	  s.minorstate = MINORSTATE_TAGEND;
	  break;
	}
      }
    }
    break;
  case MINORSTATE_TAGATTRSPACE:
    for(i = 0; i < len; ++i) {
      c = data[i];
      if(iswhitespace(c)) {
	/* Discard spaces. */
      } else if(c == ISO_eq) {
	s.minorstate = MINORSTATE_TAGATTRPARAMNQ;
	s.tagattrparamptr = 0;
	endtagfound();
	parse_tag();
	break;
      } else {
	s.tagattr[0] = lowercase(c);
	s.tagattrptr = 1;
	s.minorstate = MINORSTATE_TAGATTR;
	break;
      }
    }
    break;
  case MINORSTATE_TAGATTRPARAMNQ:
    /* We are parsing the "tag attr parameter", i.e., the link part
       in <a href="link">. */
    for(i = 0; i < len; ++i) {
      c = data[i];
      if(c == ISO_gt) {
	/* Full tag found. */
	endtagfound();
	parse_tag();
	s.minorstate = MINORSTATE_TEXT;
	s.tagattrptr = 0;
	endtagfound();
	parse_tag();
	s.tagptr = 0;
	endtagfound();
	break;
      } else if(iswhitespace(c) &&
		s.tagattrparamptr == 0) {
	/* Discard leading spaces. */
      } else if((c == ISO_citation ||
		 c == ISO_citation2) &&
		s.tagattrparamptr == 0) {
	s.minorstate = MINORSTATE_TAGATTRPARAM;
	s.quotechar = c;
	PRINTF(("tag attr param q found\n"));
	break;
      } else if(iswhitespace(c)) {
	PRINTF(("Non-leading space found at %d\n",
		s.tagattrparamptr));
	/* Stop parsing if a non-leading space was found */
	endtagfound();
	parse_tag();

	s.minorstate = MINORSTATE_TAGATTR;
	s.tagattrptr = 0;
	endtagfound();
	break;
      } else {
	s.tagattrparam[s.tagattrparamptr] = c;
	++s.tagattrparamptr;
	/* Check if the "tagattr" field is full. If so, we just eat
	   up any data left in the tag. */
	if(s.tagattrparamptr >= sizeof(s.tagattrparam) - 1) {
	  s.minorstate = MINORSTATE_TAGEND;
	  break;
	}
      }
    }
    break;
  case MINORSTATE_TAGATTRPARAM:
    /* We are parsing the "tag attr parameter", i.e., the link
       part in <a href="link">. */
    for(i = 0; i < len; ++i) {
      c = data[i];
      if(c == s.quotechar) {
	/* Found end of tag attr parameter. */
	endtagfound();
	parse_tag();

	s.minorstate = MINORSTATE_TAGATTR;
	s.tagattrptr = 0;
	endtagfound();
	break;
      } else {
	if(iswhitespace(c)) {
	  s.tagattrparam[s.tagattrparamptr] = ISO_space;
	} else {
	  s.tagattrparam[s.tagattrparamptr] = c;
	}

	++s.tagattrparamptr;
	/* Check if the "tagattr" field is full. If so, we just eat
	   up any data left in the tag. */
	if(s.tagattrparamptr >= sizeof(s.tagattrparam) - 1) {
	  s.minorstate = MINORSTATE_TAGEND;
	  break;
	}
      }
    }
    break;
  case MINORSTATE_HTMLCOMMENT:
    for(i = 0; i < len; ++i) {
      c = data[i];
      if(c == ISO_dash) {
	++s.tagptr;
      } else if(c == ISO_gt && s.tagptr > 0) {
	PRINTF(("Comment done.\n"));
	s.minorstate = MINORSTATE_TEXT;
	break;
      } else {
	s.tagptr = 0;
      }
    }
    break;
  case MINORSTATE_TAGEND:
    /* Discard characters until a '>' is seen. */
    for(i = 0; i < len; ++i) {
      if(data[i] == ISO_gt) {
	s.minorstate = MINORSTATE_TEXT;
	s.tagattrptr = 0;
	endtagfound();
	parse_tag();
	break;
      }
    }
    break;
  default:
    i = 0;
    break;
  }
  if(i >= len) {
    return len;
  }
  return i + 1;
}
/*-----------------------------------------------------------------------------------*/
void
htmlparser_parse(char *data, uint16_t datalen)
{
  uint16_t plen;

  while(datalen > 0) {
    if(datalen > 255) {
      plen = parse_word(data, 255);
    } else {
      plen = parse_word(data, (uint8_t)datalen);
    }
    datalen -= plen;
    data += plen;
  }
}
/*-----------------------------------------------------------------------------------*/