about summary refs log tree commit diff
path: root/lex.c
diff options
context:
space:
mode:
Diffstat (limited to 'lex.c')
-rw-r--r--lex.c279
1 files changed, 279 insertions, 0 deletions
diff --git a/lex.c b/lex.c
new file mode 100644
index 0000000..29b56f0
--- /dev/null
+++ b/lex.c
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2026 Nakidai Perumenei <nakidai at disroot dot org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "thac.h"
+
+#include <ctype.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+/* Copy of the token pushed back by ungettok(); gettok() restores curtok from it. */
+static struct tok storage;
+/* Token most recently produced by gettok(); the lexer helpers build it in place. */
+struct tok curtok;
+/* Set by ungettok() and cleared by gettok() once the stored token is handed out. */
+static int isunget;
+/* Newline counter: get() adds one per '\n' it reads, unget() takes it back. */
+ulong curline;
+
+/*
+ * Tell whether ch may be part of a word token: a letter a-z or A-Z,
+ * a digit 0-9, '@', '_' or '\''.
+ *
+ * Returns 1 for such a character and 0 for anything else, including
+ * values outside 0..255.
+ */
+static int
+isidish(int ch)
+{
+	static const char idchars[] =
+		"abcdefghijklmnopqrstuvwxyz"
+		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+		"0123456789"
+		"@_'";
+
+	/* memchr() narrows ch to unsigned char, so reject wider values first */
+	if (ch < 0 || ch > 255)
+		return 0;
+
+	/* sizeof - 1 keeps the terminating NUL out of the search */
+	return memchr(idchars, ch, sizeof(idchars) - 1) != NULL;
+}
+
+/*
+ * Store ch at the end of curtok's text buffer and bump its length.
+ *
+ * When the buffer has no room for one more character, its capacity is
+ * raised to (cap + sizeof(*p)) * 2 and the buffer is reallocated first.
+ * dieno() is called if there is no buffer to write to afterwards.
+ */
+static void
+append(int ch)
+{
+	if (curtok.info.len + 1 > curtok.info.cap)
+	{
+		curtok.info.cap = (curtok.info.cap + sizeof(*curtok.info.p)) * 2;
+		curtok.info.p = realloc(curtok.info.p, curtok.info.cap);
+	}
+	if (curtok.info.p == NULL)
+		dieno(1, "realloc()");
+
+	curtok.info.p[curtok.info.len] = ch;
+	++curtok.info.len;
+}
+
+/*
+ * Read the next character from curfile, add it to the text of curtok
+ * and count it in curline if it is a newline.
+ *
+ * Returns the character read, or EOF when getc() returns EOF; in that
+ * case nothing is appended and curline is left alone.
+ */
+static int
+get(void)
+{
+	int ch;
+
+	ch = getc(curfile);
+	/* hand back EOF itself: every caller compares the result against EOF */
+	if (ch == EOF)
+		return EOF;
+	append(ch);
+
+	curline += (ch == '\n');
+
+	return ch;
+}
+
+/*
+ * Undo the most recent get(): drop the last character from curtok's
+ * text, take back the line count if that character was a newline, and
+ * push ch back onto curfile.
+ *
+ * Returns whatever ungetc() returns.
+ */
+static int
+unget(int ch)
+{
+	if (ch == '\n')
+		--curline;
+	curtok.info.len -= 1;
+
+	return ungetc(ch, curfile);
+}
+
+/*
+ * Read a run of isidish() characters into curtok, NUL-terminate the
+ * text and classify it.
+ *
+ * The first character that does not belong to the word is pushed back.
+ * curtok.type becomes TNUM when the text starts with a digit, the
+ * matching TKEY* value when the text is a keyword, and TID otherwise.
+ */
+void
+word(void)
+{
+	static const struct
+	{
+		const char *name;
+		enum tok_t type;
+	} keywords[] = {
+		{"assert",   TKEYASSERT},
+		{"if",       TKEYIF},
+		{"else",     TKEYELSE},
+		{"for",      TKEYFOR},
+		{"foreach",  TKEYFOREACH},
+		{"len",      TKEYLEN},
+		{"mod",      TKEYMOD},
+		{"node",     TKEYNODE},
+		{"with",     TKEYWITH},
+		{"break",    TKEYBREAK},
+		{"continue", TKEYCONT},
+	};
+	const char *s;
+	size_t i;
+	int ch;
+
+	do
+		ch = get();
+	while (ch != EOF && isidish(ch));
+	if (ch != EOF)
+		unget(ch);
+	append('\0');
+
+	s = curtok.info.p;
+
+	/* <ctype.h> functions need a value representable as unsigned char */
+	if (isdigit((unsigned char)*s))
+	{
+		curtok.type = TNUM;
+		return;
+	}
+
+	/* anything that is not a keyword is a plain identifier */
+	curtok.type = TID;
+	for (i = 0; i < sizeof(keywords) / sizeof(*keywords); ++i)
+		if (!strcmp(s, keywords[i].name))
+		{
+			curtok.type = keywords[i].type;
+			return;
+		}
+}
+
+/*
+ * Read one operator or punctuation token into curtok.
+ *
+ * The first character selects a case of the switch below; each case is
+ * generated by one of the SINGLE, DOUBLE or TRIPLE macros, which set
+ * curtok.type, NUL-terminate the token text with append('\0') and
+ * return.  A lookahead character that does not extend the operator is
+ * pushed back with unget().
+ *
+ * A first character that matches no case falls out of the switch and is
+ * reported through complain().
+ * NOTE(review): on that path curtok.type is left untouched and no
+ * terminator is appended; whether complain() returns is not visible
+ * here - confirm.
+ */
+void
+oper(void)
+{
+	int ch;
+
+	switch (ch = get())
+	{
+	/* these characters do not start any longer operator */
+#define SINGLE(ch, t) case (ch): curtok.type = (t); append('\0'); return
+	SINGLE('{', TOPBRACE);
+	SINGLE('}', TCLBRACE);
+	SINGLE('[', TOPBRACK);
+	SINGLE(']', TCLBRACK);
+	SINGLE('(', TOPPAREN);
+	SINGLE(')', TCLPAREN);
+	SINGLE(':', TCOL);
+	SINGLE(',', TCOMMA);
+	SINGLE('?', TQUEST);
+	SINGLE(';', TSEMICOL);
+	SINGLE('~', TTILDE);
+#undef SINGLE
+	/*
+	 * these are the cases where the operator is one of:
+	 * t1  = {ch1}
+	 * t21 = {ch1}{ch21}
+	 * t22 = {ch1}{ch22}
+	 *
+	 * t1 is also the result when the input ends right after ch1.
+	 * Passing EOF as ch22 (with 0 as t22) means "no second
+	 * alternative": an EOF from get() is handled by the first branch,
+	 * so the ch22 branch can never be taken.
+	 */
+#define DOUBLE(ch1, ch21, ch22, t1, t21, t22) case (ch1): \
+{ \
+	int next; \
+	curtok.type = (t1); \
+	if ((next = get()) == EOF) \
+		(void)0; \
+	else if (next == (ch21)) \
+		curtok.type = (t21); \
+	else if (next == (ch22)) \
+		curtok.type = (t22); \
+	else \
+		unget(next); \
+	append('\0'); \
+	return; \
+}
+	DOUBLE('&', '=', '&', TAMPER, TASSAMPER, TAND);
+	DOUBLE('^', '=', EOF, TCARET, TASSCARET, 0);
+	DOUBLE('=', '=', EOF, TASSIGN, TEQ, 0);
+	DOUBLE('|', '=', '|', TPIPE, TASSPIPE, TOR);
+	DOUBLE('/', '=', EOF, TSLASH, TASSSLASH, 0);
+	DOUBLE('%', '=', EOF, TPERC, TASSPERC, 0);
+	DOUBLE('!', '=', EOF, TEXCLAM, TNEQ, 0);
+	DOUBLE('-', '=', '-', TMINUS, TASSMINUS, TDECR);
+	DOUBLE('+', '=', '+', TPLUS, TASSPLUS, TINCR);
+	DOUBLE('*', '=', '*', TASTER, TASSASTER, TPOW);
+#undef DOUBLE
+	/*
+	 * these are the cases where the operator is one of:
+	 * t1  = {ch1}
+	 * t21 = {ch1}{ch21}
+	 * t22 = {ch1}{ch22}
+	 * t23 = {ch1}{ch1}
+	 * t3  = {ch1}{ch1}{ch3}
+	 *
+	 * The doubled form is detected by comparing the lookahead with
+	 * `ch', the variable switched on by oper(), which holds ch1
+	 * whenever this case is entered.  If the input ends after the
+	 * first or the second character, the type chosen so far is kept.
+	 */
+#define TRIPLE(ch1, ch21, ch22, ch3, t1, t21, t22, t23, t3) case (ch1): \
+{ \
+	int next; \
+	curtok.type = (t1); \
+	if ((next = get()) == EOF) \
+		(void)0; \
+	else if (next == (ch21)) \
+		curtok.type = (t21); \
+	else if (next == (ch22)) \
+		curtok.type = (t22); \
+	else if (next == ch) \
+	{ \
+		curtok.type = (t23); \
+		if ((next = get()) == EOF) \
+			(void)0; \
+		else if (next == (ch3)) \
+			curtok.type = (t3); \
+		else \
+			unget(next); \
+	} \
+	else \
+		unget(next); \
+	append('\0'); \
+	return; \
+}
+	TRIPLE('>', '=', '<', '=', TGREAT, TGREATEQ, TCONCAT, TRSHIFT, TASSRSHIFT);
+	TRIPLE('<', '=', '-', '=', TLESS, TLESSEQ, TARRLEFT, TLSHIFT, TASSLSHIFT);
+#undef TRIPLE
+	}
+
+	/* reached only when no case above matched and returned */
+	complain(1, "unknown operator starting with %c", ch);
+}
+
+/*
+ * Make curtok the next token and return its type.
+ *
+ * A token pushed back with ungettok() is handed out again without
+ * reading any input.  Otherwise characters for which isspace() is true
+ * are skipped, and the first other character decides whether word() or
+ * oper() reads the token.  Returns TEOF once the input is exhausted.
+ */
+enum tok_t
+gettok(void)
+{
+	int ch;
+
+	curtok.info = (struct tinfo){.p = NULL, .len = 0, .cap = 0};
+
+	if (isunget)
+	{
+		curtok = storage;
+		isunget = 0;
+		return curtok.type;
+	}
+
+	/* get() records every character it reads; forget skipped whitespace */
+	do
+	{
+		curtok.info.len = 0;
+		ch = get();
+	} while (ch != EOF && isspace(ch));
+
+	if (ch == EOF)
+	{
+		curtok.type = TEOF;
+		return curtok.type;
+	}
+
+	/* put the first character back so that word()/oper() see it again */
+	unget(ch);
+
+	if (isidish(ch))
+		word();
+	else
+		oper();
+
+	return curtok.type;
+}
+
+/*
+ * Push the current token back: curtok is saved in storage and the next
+ * gettok() call returns it again instead of reading new input.
+ *
+ * There is a single storage slot, so a second call before the next
+ * gettok() overwrites the token saved by the first.
+ */
+void
+ungettok(void)
+{
+	storage = curtok;
+	isunget = 1;
+}
+
+/*
+ * Fetch the next token with gettok() and check its type against a list
+ * of acceptable types.
+ *
+ * first is the first acceptable type; further candidates follow as
+ * variadic arguments, and the list must be terminated by TEOF.
+ *
+ * Returns 1 if the token's type is one of the candidates, 0 if it is
+ * not or if gettok() returned TEOF.  The token stays in curtok either
+ * way.
+ */
+int
+_exptok(enum tok_t first, ...)
+{
+	va_list args;
+	enum tok_t cand;
+	int matched = 0;
+
+	if (gettok() == TEOF)
+		return 0;
+	if (first == curtok.type)
+		return 1;
+
+	va_start(args, first);
+	for (cand = va_arg(args, enum tok_t);
+	     cand != TEOF;
+	     cand = va_arg(args, enum tok_t))
+		if (cand == curtok.type)
+		{
+			matched = 1;
+			break;
+		}
+	va_end(args);
+
+	return matched;
+}