/* Copyright (C) 2008 Emmanuel Varoquaux
This file is part of XOS.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */
#include "lex.h"
#include "safe_malloc.h"
#include "error.h"
#include "vars.h"
#include
#include
#include
#include
#include
#include
#include <limits.h>
/* Character input */
/* Source of raw characters: lexer_getc() calls this whenever no pushed-back
   character is pending. It is assigned outside this file section. */
lexer_getc_func_t lexer_getc_func;
/* Stack of characters returned to the input by lexer_ungetc(); the most
   recently pushed character is read back first. The operator/backslash path
   of read_lexeme() pushes two characters in a row, hence the two slots.
   No bounds check is performed on this buffer. */
static int pushed_back_buffer[2];
/* Number of characters currently held in pushed_back_buffer. */
static int pushed_back_pos = 0;
/* Returns the next input character: a pushed-back character if one is
   pending (most recently pushed first), otherwise whatever lexer_getc_func()
   delivers. Every newline handed out increments line_number. */
static int lexer_getc()
{
	int ch;

	if (pushed_back_pos != 0) {
		pushed_back_pos--;
		ch = pushed_back_buffer[pushed_back_pos];
	} else {
		ch = lexer_getc_func();
	}

	if (ch == '\n')
		++line_number;
	return ch;
}
/* Returns c to the input so that the next lexer_getc() yields it again.
   Pushing back a newline undoes the line_number increment made when it was
   read. The push-back buffer is not bounds-checked. */
static void lexer_ungetc(int c)
{
	pushed_back_buffer[pushed_back_pos] = c;
	pushed_back_pos++;

	if (c == '\n')
		--line_number;
}
/* Lexical analysis */
/* Initial size of the word buffer, and the number of bytes added each time
   word_append() has to enlarge it. */
#define WORD_GROW_SIZE 256
/* etats de l'analyseur lexical */
enum {
LS_START,
LS_START_BACKSLASH,
LS_COMMENT,
/* operateur */
LS_OPERATOR,
LS_OPERATOR_BACKSLASH,
/* mot */
LS_NUMBER,
LS_NUMBER_BACKSLASH,
LS_WORD,
LS_WORD_BACKSLASH,
LS_WORD_SINGLE_QUOTE,
LS_WORD_DOUBLE_QUOTE,
LS_WORD_DOUBLE_QUOTE_BACKSLASH
};
/* States for operator recognition: which character opened the operator */
enum {
OS_AND, /* '&' was read */
OS_BAR, /* '|' was read */
OS_GREATER, /* '>' was read */
OS_LESS /* '<' was read */
};
/* Current state of the scanner (an LS_* value); reset by read_lexeme(). */
static int lexer_state;
/* While in LS_OPERATOR: the OS_* value for the operator's first character. */
static int operator_state;
/* Value of the digit string being scanned, built up by add_digit(). */
static int number;
/* Buffer holding the text of the word being scanned. */
static char *word;
/* Allocated size of the word buffer, in bytes. */
static size_t word_size;
/* Index in the word buffer where the next character will be stored. */
static int word_pos;
/* Error flag copied into current_token.error for TT_END_OF_INPUT tokens:
   0 for a normal end of input, 1 after an unterminated quote. */
static int error;
/* The token most recently produced by lex() or skip_line(). */
struct token_struct current_token;
/* Returns 1 if c is an ASCII blank character, 0 otherwise. The isascii()
   guard keeps values such as EOF or bytes above 127 away from isblank(). */
static inline int is_blank(int c)
{
	if (!isascii(c))
		return 0;
	return isblank(c) != 0;
}
/* Returns 1 if c is an ASCII decimal digit, 0 otherwise. The isascii()
   guard keeps values such as EOF or bytes above 127 away from isdigit(). */
static inline int is_digit(int c)
{
	if (!isascii(c))
		return 0;
	return isdigit(c) != 0;
}
/* Returns 1 if c ends an unquoted word: a blank, or one of | & ; ( ) < >
   or newline. Returns 0 otherwise. */
static inline int is_meta(int c)
{
	if (is_blank(c))
		return 1;
	/* strchr() would match the terminating NUL of the set, so rule out 0. */
	if (c == 0)
		return 0;
	return strchr("|&;()<>\n", c) != NULL;
}
/* Resets the numeric accumulator before the first digit of a digit string
   is passed to add_digit(). */
static void init_number()
{
number = 0;
}
/* Appends the decimal digit c ('0'..'9', as checked by the caller) to the
   value accumulated in number.

   The digit string comes straight from the input and can be arbitrarily
   long, so the value saturates at INT_MAX instead of overflowing a signed
   int, which would be undefined behaviour. */
static void add_digit(char c)
{
	const int digit = c - '0';

	/* number * 10 + digit fits in an int exactly when
	   number <= (INT_MAX - digit) / 10. */
	if (number > (INT_MAX - digit) / 10)
		number = INT_MAX;
	else
		number = number * 10 + digit;
}
/* Starts a new, empty word: allocates a fresh buffer of WORD_GROW_SIZE
   bytes with safe_malloc() and resets the write position. */
static void init_word()
{
	char *buffer = safe_malloc(WORD_GROW_SIZE);

	word = buffer;
	word_size = WORD_GROW_SIZE;
	word_pos = 0;
}
/* Appends c to the word being built, enlarging the buffer by
   WORD_GROW_SIZE bytes when only one free byte is left. That last byte is
   kept in reserve for the terminator written by end_word(). */
static void word_append(char c)
{
	const int last_slot = (int)word_size - 1;

	if (word_pos == last_slot) {
		const size_t grown = word_size + WORD_GROW_SIZE;

		word = safe_realloc(word, grown);
		word_size = grown;
	}

	word[word_pos] = c;
	word_pos++;
}
/* Terminates the word being built, turning the buffer into a C string.
   word_append() always leaves room for this byte. */
static void end_word()
{
	char *terminator = word + word_pos;

	*terminator = '\0';
}
/* Releases the buffer of the word being built.
   The pointer is cleared afterwards so that a stray second call is a
   harmless free(NULL) rather than a double free. */
static void free_word()
{
	free(word);
	word = NULL;
}
/* scanner */
static int read_lexeme()
{
int c;
int token_type;
lexer_state = LS_START;
while (1) {
c = lexer_getc();
switch (lexer_state) {
case LS_START:
if (c == EOF) {
error = 0;
return TT_END_OF_INPUT;
}
if (is_blank(c))
continue;
switch (c) {
case '\\':
lexer_state = LS_START_BACKSLASH;
continue;
case '|':
lexer_state = LS_OPERATOR;
operator_state = OS_BAR;
continue;
case '&':
lexer_state = LS_OPERATOR;
operator_state = OS_AND;
continue;
case ';':
return TT_SEMICOLON;
case '(':
return TT_LEFT_PARENTHESIS;
case ')':
return TT_RIGHT_PARENTHESIS;
case '<':
lexer_state = LS_OPERATOR;
operator_state = OS_LESS;
continue;
case '>':
lexer_state = LS_OPERATOR;
operator_state = OS_GREATER;
continue;
case '\n':
return TT_NEWLINE;
case '#':
lexer_state = LS_COMMENT;
continue;
}
init_word();
if (is_digit(c)) {
lexer_state = LS_NUMBER;
init_number();
goto add_digit;
}
else {
lexer_state = LS_WORD;
goto word;
}
case LS_START_BACKSLASH:
if (c == EOF) {
error = 0;
return TT_END_OF_INPUT;
}
if (c == '\n') {
lexer_state = LS_START;
continue;
}
lexer_state = LS_WORD_BACKSLASH;
init_word();
goto word_backslash;
case LS_COMMENT:
if (c == EOF) {
error = 0;
return TT_END_OF_INPUT;
}
if (c == '\n')
return TT_NEWLINE;
continue;
/* operateur */
case LS_OPERATOR:
if (c == '\\') {
lexer_state = LS_OPERATOR_BACKSLASH;
continue;
}
operator:
switch (operator_state) {
case OS_AND:
if (c == '&')
return TT_AND_AND;
token_type = TT_AMPERSAND;
break;
case OS_BAR:
if (c == '|')
return TT_OR_OR;
token_type = TT_BAR;
break;
case OS_GREATER:
switch (c) {
case '>':
return TT_GREATER_GREATER;
case '&':
return TT_GREATER_AND;
case '|':
return TT_GREATER_BAR;
}
token_type = TT_GREATER;
break;
case OS_LESS:
default:
switch (c) {
case '&':
return TT_LESS_AND;
case '>':
return TT_LESS_GREATER;
}
token_type = TT_LESS;
break;
}
lexer_ungetc(c);
return token_type;
case LS_OPERATOR_BACKSLASH:
if (c == '\n') {
lexer_state = LS_OPERATOR;
continue;
}
lexer_ungetc(c);
c = '\\';
goto operator;
/* mot */
case LS_NUMBER:
if (c == EOF)
return TT_WORD;
if (c == '\\') {
lexer_state = LS_NUMBER_BACKSLASH;
continue;
}
if (is_meta(c)) {
lexer_ungetc(c);
return c == '<' || c == '>' ? TT_NUMBER : TT_WORD;
}
if (!is_digit(c)) {
lexer_state = LS_WORD;
goto word;
}
goto add_digit;
case LS_NUMBER_BACKSLASH:
if (c == EOF)
return TT_NUMBER;
if (c == '\n') {
lexer_state = LS_NUMBER;
continue;
}
lexer_state = LS_WORD_BACKSLASH;
goto word_backslash;
case LS_WORD:
if (c == EOF)
return TT_WORD;
if (is_meta(c)) {
lexer_ungetc(c);
return TT_WORD;
}
word:
switch (c) {
case '\\':
lexer_state = LS_WORD_BACKSLASH;
continue;
case '\'':
lexer_state = LS_WORD_SINGLE_QUOTE;
break;
case '"':
lexer_state = LS_WORD_DOUBLE_QUOTE;
break;
}
goto word_append;
case LS_WORD_BACKSLASH:
if (c == EOF)
return TT_WORD;
if (c == '\n') {
lexer_state = LS_WORD;
continue;
}
word_backslash:
lexer_state = LS_WORD;
word_append('\\');
goto word_append;
case LS_WORD_SINGLE_QUOTE:
if (c == EOF)
goto error_unexpected_end_of_file;
if (c == '\'')
lexer_state = LS_WORD;
goto word_append;
case LS_WORD_DOUBLE_QUOTE:
if (c == EOF)
goto error_unexpected_end_of_file;
switch (c) {
case '\\':
lexer_state = LS_WORD_DOUBLE_QUOTE_BACKSLASH;
continue;
case '"':
lexer_state = LS_WORD;
break;
}
goto word_append;
case LS_WORD_DOUBLE_QUOTE_BACKSLASH:
if (c == EOF)
goto error_unexpected_end_of_file;
lexer_state = LS_WORD_DOUBLE_QUOTE;
if (c == '\n')
continue;
word_append('\\');
goto word_append;
}
add_digit:
add_digit(c);
word_append:
word_append(c);
}
error_unexpected_end_of_file:
syntax_error("unexpected end of file");
free_word();
error = 1;
return TT_END_OF_INPUT;
}
/* tokenizer */
/* Fills current_token for a lexeme of the given type.
   - TT_NUMBER: the word buffer is released and the numeric value stored.
   - TT_WORD: the word buffer is terminated and its pointer stored in
     current_token.word (presumably the consumer then owns it - confirm
     against the callers).
   - TT_END_OF_INPUT: the scanner's error flag is stored.
   Other types carry no extra data. */
static void make_token(int type)
{
	current_token.type = type;

	if (type == TT_NUMBER) {
		free_word();
		current_token.number = number;
	} else if (type == TT_WORD) {
		end_word();
		current_token.word = word;
	} else if (type == TT_END_OF_INPUT) {
		current_token.error = error;
	}
}
/* Scans the next lexeme from the input and stores the resulting token in
   current_token. */
void lex()
{
	const int type = read_lexeme();

	make_token(type);
}
/* Discards input up to and including the next newline. current_token
   becomes TT_NEWLINE, or TT_END_OF_INPUT (with the error flag cleared) if
   the input ends first. */
void skip_line()
{
	int ch;

	for (;;) {
		ch = lexer_getc();
		if (ch == '\n') {
			make_token(TT_NEWLINE);
			return;
		}
		if (ch == EOF) {
			error = 0;
			make_token(TT_END_OF_INPUT);
			return;
		}
	}
}
/* Returns a newly allocated copy of string (obtained from safe_malloc())
   with shell quoting removed:
   - outside quotes, a backslash is dropped and the next character kept;
   - single quotes are dropped and their contents kept verbatim;
   - double quotes are dropped; inside them a backslash is dropped only in
     front of '"' or '\\', otherwise it is kept with the next character.
   A backslash or an open quote left pending at the end of string produces
   no output. */
char *remove_quotes(const char *string)
{
	char *result = safe_malloc(strlen(string) + 1);
	char *out = result;
	const char *in;
	int mode = LS_WORD;

	for (in = string; *in != '\0'; in++) {
		const char ch = *in;

		if (mode == LS_WORD) {
			if (ch == '\\') {
				mode = LS_WORD_BACKSLASH;
				continue;
			}
			if (ch == '\'') {
				mode = LS_WORD_SINGLE_QUOTE;
				continue;
			}
			if (ch == '"') {
				mode = LS_WORD_DOUBLE_QUOTE;
				continue;
			}
		} else if (mode == LS_WORD_BACKSLASH) {
			/* the escaped character is copied as is */
			mode = LS_WORD;
		} else if (mode == LS_WORD_SINGLE_QUOTE) {
			if (ch == '\'') {
				mode = LS_WORD;
				continue;
			}
		} else if (mode == LS_WORD_DOUBLE_QUOTE) {
			if (ch == '\\') {
				mode = LS_WORD_DOUBLE_QUOTE_BACKSLASH;
				continue;
			}
			if (ch == '"') {
				mode = LS_WORD;
				continue;
			}
		} else if (mode == LS_WORD_DOUBLE_QUOTE_BACKSLASH) {
			/* only '"' and '\\' are escapable here; otherwise the
			   backslash is literal */
			if (ch != '"' && ch != '\\')
				*out++ = '\\';
			mode = LS_WORD_DOUBLE_QUOTE;
		}
		*out++ = ch;
	}
	*out = '\0';
	return result;
}