gopher/url.c
2012-04-06 20:13:48 -04:00

585 lines
12 KiB
C

#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include "url.h"
/*
 * Parser states used by ParseURL.  Each value is also used there as an
 * index into the URLRange members of URLComponents (counted from the
 * `scheme` member), so the numbering must stay in this order.
 */
enum {
    kScheme   = 0,
    kUser     = 1,
    kPassword = 2,
    kHost     = 3,
    kPort     = 4,
    kPath     = 5,
    kParams   = 6,
    kQuery    = 7,
    kFragment = 8
};
/*
* scheme://username:password@domain:port/path;params?query_string#fragment_id
*
*
*/
/*
 * Copy one component of a parsed URL into `dest` as a NUL-terminated
 * C string.
 *
 * url        - the text the ranges in `components` refer to.
 * components - the parsed ranges.
 * type       - component selector, URLComponentScheme through
 *              URLComponentPathAndQuery; it indexes the URLRange members
 *              of `components` starting at `scheme`.
 * dest       - output buffer, or NULL to only query the length.  No size
 *              is passed in, so the caller must provide room for the
 *              component plus the terminating NUL.
 *
 * Returns the component length (terminator not counted), or -1 when
 * `url` or `components` is NULL or `type` is out of range.
 */
int URLComponentGetC(const char *url, URLComponents *components, int type, char *dest)
{
    URLRange span;

    if (url == NULL || components == NULL) return -1;
    if (type < URLComponentScheme) return -1;
    if (type > URLComponentPathAndQuery) return -1;

    span = (&components->scheme)[type];

    if (dest != NULL)
    {
        if (span.length != 0)
        {
            memcpy(dest, url + span.location, span.length);
        }
        dest[span.length] = 0;
    }

    return span.length;
}
/*
 * Copy one component of a parsed URL into `dest` as a length-prefixed
 * (Pascal-style) string: dest[0] receives the length and the characters
 * follow from dest[1].  No terminating NUL is written.
 *
 * url        - the text the ranges in `components` refer to.
 * components - the parsed ranges.
 * type       - component selector, URLComponentScheme through
 *              URLComponentPathAndQuery.
 * dest       - output buffer, or NULL to only query the length.
 *
 * Returns the component length, or -1 when `url` or `components` is NULL,
 * `type` is out of range, or (with a non-NULL `dest`) the component is
 * longer than 255 bytes and so cannot be length-prefixed in one byte.
 */
int URLComponentGet(const char *url, URLComponents *components, int type, char *dest)
{
    URLRange span;

    if (url == NULL || components == NULL) return -1;
    if (type < URLComponentScheme || type > URLComponentPathAndQuery) return -1;

    span = (&components->scheme)[type];

    // length query only; the 255 limit is not applied here.
    if (dest == NULL) return span.length;

    if (span.length > 255) return -1;

    dest[0] = span.length;
    if (span.length != 0)
    {
        memcpy(dest + 1, url + span.location, span.length);
    }
    return span.length;
}
/*
 * Classify the scheme text cp[0 .. size-1] and record the result in `c`:
 * c->schemeType receives the matching SCHEME_* value and c->portNumber
 * the port listed for that scheme in the table below (0 when none).
 *
 *  - NULL or empty scheme  -> SCHEME_NONE,    port 0
 *  - unrecognized scheme   -> SCHEME_UNKNOWN, port 0
 *
 * Matching is ASCII case-insensitive: each input byte is OR-ed with 0x20
 * and compared with the lowercase name.
 *
 * The previous generated matcher read the text through an `unsigned *`
 * and compared against multi-character constants.  That only works when
 * `unsigned` is 16 bits and little-endian, breaks the aliasing rules, may
 * read misaligned, and with a wider `unsigned` reads past the scheme.
 * The byte-wise table lookup below accepts exactly the same names
 * without those assumptions.
 */
static void parseScheme(const char *cp, unsigned size, URLComponents *c)
{
    static const struct {
        const char *name;   // lowercase scheme name
        unsigned length;    // strlen(name)
        int type;           // SCHEME_* value
        int port;           // port recorded for the scheme (0 = none)
    } known[] = {
        { "ssh",    3, SCHEME_SSH,    22   },
        { "gopher", 6, SCHEME_GOPHER, 70   },
        { "file",   4, SCHEME_FILE,   0    },
        { "telnet", 6, SCHEME_TELNET, 23   },
        { "afp",    3, SCHEME_AFP,    548  },
        { "ftp",    3, SCHEME_FTP,    21   },
        { "sftp",   4, SCHEME_SFTP,   115  },
        { "nntp",   4, SCHEME_NNTP,   119  },
        { "http",   4, SCHEME_HTTP,   80   },
        { "https",  5, SCHEME_HTTPS,  443  },
        { "nfs",    3, SCHEME_NFS,    2049 }
    };
    unsigned i;
    unsigned j;

    if (!c) return;

    if (!cp || !size)
    {
        c->portNumber = 0;
        c->schemeType = SCHEME_NONE;
        return;
    }

    for (i = 0; i < sizeof(known) / sizeof(known[0]); ++i)
    {
        if (known[i].length != size) continue;

        for (j = 0; j < size; ++j)
        {
            // fold A-Z onto a-z; non-letters never equal a table letter.
            if ((cp[j] | 0x20) != known[i].name[j]) break;
        }

        if (j == size)
        {
            c->schemeType = known[i].type;
            c->portNumber = known[i].port;
            return;
        }
    }

    c->portNumber = 0;
    c->schemeType = SCHEME_UNKNOWN;
}
// Store [start, end) as the range for slot `state`.  Empty spans are not
// stored, so the slot keeps whatever it already held.
static void commitRange(URLRange *slots, int state, int start, int end)
{
    URLRange r;

    if (end <= start) return;
    r.location = start;
    r.length = end - start;
    slots[state] = r;
}

/*
 * Split a URL of the form
 *
 *   scheme://username:password@domain:port/path;params?query_string#fragment_id
 *
 * into ranges (offset + length into `url`) stored in `components`.
 *
 * url        - URL text; only the first `length` bytes are examined, so
 *              it does not have to be NUL-terminated.
 * length     - number of bytes in `url`.
 * components - receives the result; it is zeroed first.
 *
 * Returns 1 on success and 0 on failure: NULL arguments, an empty scheme
 * (text starting with ':'), a control / 8-bit / otherwise illegal
 * character, or a second ':' inside the password or port.  On failure
 * `components` may be partially filled.
 *
 * Besides the ranges, schemeType and portNumber are filled in:
 * portNumber is the scheme's default unless an explicit port is present,
 * in which case it is the parsed value, or 0 when the port is not a
 * decimal number in 0..65535.
 */
int ParseURL(const char *url, int length, struct URLComponents *components)
{
    char c;
    int state;
    int hasScheme;
    int hasAt;
    int i, l;
    int start;
    URLRange *rangePtr;

    if (!components || !url) return 0;

    // the URLRange members are addressed as an array indexed by state.
    rangePtr = &components->scheme;
    memset(components, 0, sizeof(URLComponents));

    // 1. check for a scheme.
    // [A-Za-z0-9.+-]+ ':'
    hasScheme = 0;
    hasAt = 0;

    for (i = 0; i < length; ++i)
    {
        c = url[i];
        if (c == ':')
        {
            hasScheme = 1;
            break;
        }
        if (c >= 'a' && c <= 'z') continue;
        if (c >= 'A' && c <= 'Z') continue;
        if (c >= '0' && c <= '9') continue;
        if (c == '.') continue;
        if (c == '+') continue;
        if (c == '-') continue;
        break;
    }

    if (hasScheme)
    {
        if (i == 0) return 0;
        commitRange(rangePtr, kScheme, 0, i);
        parseScheme(url, i, components);
        ++i; // skip the ':'
    }
    else
    {
        i = 0;
    }

    // 2. check for //
    // //user:password@domain:port/
    // otherwise, it's a path
    // Compared byte-wise and bounded by `length`, so nothing past the
    // caller-supplied size is read.
    if (i + 1 < length && url[i] == '/' && url[i + 1] == '/')
    {
        state = kUser;
        i += 2;
    }
    else
    {
        state = kPath;
    }

    start = i;

    for ( ; i < length; ++i)
    {
        c = url[i];

        // control characters and 8-bit characters are rejected.
        if ((c < 32) || (c & 0x80))
        {
            return 0;
        }

        switch (c)
        {
        // illegal chars.
        // % # not allowed but allowed.
        case ' ':
        case '"':
        case '<':
        case '>':
        case '[':
        case '\\':
        case ']':
        case '^':
        case '`':
        case '{':
        case '|':
        case '}':
        case 0x7f:
            return 0;

        case ':':
            // username -> password
            // domain -> port
            // password -> error
            // port -> error
            // all others, no transition.
            switch (state)
            {
            case kUser:
            case kHost:
                commitRange(rangePtr, state, start, i);
                start = i + 1;
                state++;
                break;

            case kPassword:
            case kPort:
                return 0;
            }
            break;

        case '@':
            // username -> domain
            // password -> domain
            // all others no transition.
            switch (state)
            {
            case kUser:
            case kPassword:
                hasAt = 1;
                commitRange(rangePtr, state, start, i);
                start = i + 1;
                state = kHost;
                break;
            }
            break;

        case '/':
            // domain -> path
            // port -> path
            // username -> path (the "username" was really the domain)
            // password -> path (fixed up to domain:port after the loop)
            // all others, no transition: '/' is part of the filename.
            switch (state)
            {
            case kUser:
                hasAt = 1; // store directly as host; no fix-up needed.
                state = kHost;
                /* fall through */
            case kPassword:
            case kHost:
            case kPort:
                commitRange(rangePtr, state, start, i);
                start = i; // '/' is part of the path.
                state = kPath;
                break;
            }
            break;

        case ';':
            // path -> param
            // all others, no transition.
            if (state == kPath)
            {
                commitRange(rangePtr, state, start, i);
                start = i + 1;
                state = kParams;
            }
            break;

        case '?':
            // path -> query
            // param -> query
            // all others, no transition.
            if (state == kPath || state == kParams)
            {
                commitRange(rangePtr, state, start, i);
                start = i + 1;
                state = kQuery;
            }
            break;

        case '#':
            // everything transitions to a fragment, which runs to the
            // end of the input, so scanning can stop here.
            commitRange(rangePtr, state, start, i);
            state = kFragment;
            start = i + 1;
            i = length; // leave the loop.
            break;
        }
    }

    // finish up the last portion.  The end is `length`, not `i`: after
    // the '#' shortcut above the loop increment leaves i == length + 1,
    // which would make the fragment one byte too long.
    commitRange(rangePtr, state, start, length);

    if (!hasAt)
    {
        // user:password was actually domain:port
        components->host = components->user;
        components->port = components->password;
        components->user.location = 0;
        components->user.length = 0;
        components->password.location = 0;
        components->password.length = 0;
    }

    // port number conversion.
    if (components->port.length)
    {
        const char *digits = url + components->port.location;
        unsigned long p = 0;

        l = components->port.length;
        for (i = 0; i < l; ++i)
        {
            c = digits[i];
            if ((c < '0') || (c > '9'))
            {
                p = 0;
                break;
            }
            p = p * 10 + (unsigned long)(c - '0');
            if (p > 65535UL)
            {
                // not a valid port; treat like a non-numeric one rather
                // than letting the accumulator overflow.
                p = 0;
                break;
            }
        }
        components->portNumber = p;
    }

#if 0
    // path and query.
    // path;params?query
    {
        URLRange range;
        range = components->path;
        if (range.length)
        {
            if (components->params.length)
                range.length += components->params.length + 1;
            if (components->query.length)
                range.length += components->query.length + 1;
            components->pathAndQuery = range;
        }
    }
#endif

    return 1;
}