/* Copyright (C) 2008  Emmanuel Varoquaux

   This file is part of XOS.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>. */

#include "lex.h"

#include "safe_malloc.h"
#include "error.h"
#include "vars.h"

#include <ctype.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Character reading */

/* Callback that supplies the next input character (or EOF). */
lexer_getc_func_t lexer_getc_func;

/* Small push-back stack.  Two slots are needed because the
   LS_OPERATOR_BACKSLASH state pushes back two characters in a row. */
static int pushed_back_buffer[2];
static int pushed_back_pos = 0;

/* Returns the next character, taking pushed-back characters first, and
   increments line_number (defined outside this file) on every newline. */
static int lexer_getc(void)
{
  int c;

  c = pushed_back_pos ? pushed_back_buffer[--pushed_back_pos] : lexer_getc_func();
  if (c == '\n')
    line_number++;
  return c;
}

/* Pushes c back so that the next lexer_getc() returns it again; undoes the
   line count if c is a newline.  At most two characters may be pending. */
static void lexer_ungetc(int c)
{
  if (c == '\n')
    line_number--;
  pushed_back_buffer[pushed_back_pos++] = c;
}

/* Lexical analysis */

#define WORD_GROW_SIZE 256

/* lexer states */
enum {
  LS_START,
  LS_START_BACKSLASH,
  LS_COMMENT,

  /* operator */
  LS_OPERATOR,
  LS_OPERATOR_BACKSLASH,

  /* word */
  LS_NUMBER,
  LS_NUMBER_BACKSLASH,
  LS_WORD,
  LS_WORD_BACKSLASH,
  LS_WORD_SINGLE_QUOTE,
  LS_WORD_DOUBLE_QUOTE,
  LS_WORD_DOUBLE_QUOTE_BACKSLASH
};

/* states for operator recognition */
enum {
  OS_AND,
  OS_BAR,
  OS_GREATER,
  OS_LESS
};

static int lexer_state;
static int operator_state;
static int number;       /* value of the digit run being scanned */
static char *word;       /* text of the word being scanned */
static size_t word_size; /* allocated size of word */
static int word_pos;     /* number of characters stored in word */
static int error;        /* copied into current_token.error for TT_END_OF_INPUT */

struct token_struct current_token;

/* The isascii() guard keeps the <ctype.h> argument inside 0..127, so EOF
   and non-ASCII values are never classified. */
static inline int is_blank(int c)
{
  return isascii(c) && isblank(c);
}

static inline int is_digit(int c)
{
  return isascii(c) && isdigit(c);
}

/* True for blanks and for the characters that end a word.  The c != 0 test
   is required because strchr() would otherwise match the terminator. */
static inline int is_meta(int c)
{
  return is_blank(c) || (c && strchr("|&;()<>\n", c));
}

static void init_number(void)
{
  number = 0;
}

/* Appends the decimal digit c to number.  The value saturates at INT_MAX
   instead of overflowing, since signed overflow is undefined behavior and
   the digit run comes straight from the input. */
static void add_digit(char c)
{
  int digit = c - '0';

  if (number > (INT_MAX - digit) / 10)
    number = INT_MAX;
  else
    number = number * 10 + digit;
}

static void init_word(void)
{
  word = safe_malloc(WORD_GROW_SIZE);
  word_size = WORD_GROW_SIZE;
  word_pos = 0;
}

/* Appends c to word, growing the buffer by WORD_GROW_SIZE when only one
   free byte is left; that byte is reserved for the terminator written by
   end_word(). */
static void word_append(char c)
{
  if (word_pos + 1 == (int)word_size) {
    word = safe_realloc(word, word_size + WORD_GROW_SIZE);
    word_size += WORD_GROW_SIZE;
  }
  word[word_pos++] = c;
}

static void end_word(void)
{
  word[word_pos] = '\0';
}

static void free_word(void)
{
  free(word);
  word = NULL; /* do not keep a dangling pointer around */
}

/* scanner */

/* Reads one lexeme from the input and returns its token type (TT_*).
 *
 * Side effects:
 *  - TT_WORD: the text, quotes and backslashes included, is left
 *    unterminated in word; make_token() terminates it.
 *  - TT_NUMBER: number holds the value of the digit run.  This type is
 *    returned only when the digits are directly followed by '<' or '>'.
 *  - TT_END_OF_INPUT: error is 0 at a regular end of input, and 1 when the
 *    input ended inside a quoted string (a syntax error is reported and the
 *    partial word is freed).
 *
 * A backslash followed by a newline is a line continuation and is dropped;
 * any other backslash is kept in the word together with the character that
 * follows it.
 */
static int read_lexeme(void)
{
  int c;
  int token_type;

  lexer_state = LS_START;
  while (1) {
    c = lexer_getc();
    switch (lexer_state) {
    case LS_START:
      if (c == EOF) {
        error = 0;
        return TT_END_OF_INPUT;
      }
      if (is_blank(c))
        continue;
      switch (c) {
      case '\\':
        lexer_state = LS_START_BACKSLASH;
        continue;
      case '|':
        lexer_state = LS_OPERATOR;
        operator_state = OS_BAR;
        continue;
      case '&':
        lexer_state = LS_OPERATOR;
        operator_state = OS_AND;
        continue;
      case ';':
        return TT_SEMICOLON;
      case '(':
        return TT_LEFT_PARENTHESIS;
      case ')':
        return TT_RIGHT_PARENTHESIS;
      case '<':
        lexer_state = LS_OPERATOR;
        operator_state = OS_LESS;
        continue;
      case '>':
        lexer_state = LS_OPERATOR;
        operator_state = OS_GREATER;
        continue;
      case '\n':
        return TT_NEWLINE;
      case '#':
        lexer_state = LS_COMMENT;
        continue;
      }
      /* Anything else starts a word. */
      init_word();
      if (is_digit(c)) {
        lexer_state = LS_NUMBER;
        init_number();
        goto add_digit;
      }
      else {
        lexer_state = LS_WORD;
        goto word;
      }

    case LS_START_BACKSLASH:
      if (c == EOF) {
        error = 0;
        return TT_END_OF_INPUT;
      }
      if (c == '\n') {
        lexer_state = LS_START;
        continue;
      }
      lexer_state = LS_WORD_BACKSLASH;
      init_word();
      goto word_backslash;

    case LS_COMMENT:
      if (c == EOF) {
        error = 0;
        return TT_END_OF_INPUT;
      }
      if (c == '\n')
        return TT_NEWLINE;
      continue;

    /* operator */

    case LS_OPERATOR:
      if (c == '\\') {
        lexer_state = LS_OPERATOR_BACKSLASH;
        continue;
      }
    operator:
      switch (operator_state) {
      case OS_AND:
        if (c == '&')
          return TT_AND_AND;
        token_type = TT_AMPERSAND;
        break;
      case OS_BAR:
        if (c == '|')
          return TT_OR_OR;
        token_type = TT_BAR;
        break;
      case OS_GREATER:
        switch (c) {
        case '>':
          return TT_GREATER_GREATER;
        case '&':
          return TT_GREATER_AND;
        case '|':
          return TT_GREATER_BAR;
        }
        token_type = TT_GREATER;
        break;
      case OS_LESS:
      default:
        switch (c) {
        case '&':
          return TT_LESS_AND;
        case '>':
          return TT_LESS_GREATER;
        }
        token_type = TT_LESS;
        break;
      }
      /* Single-character operator: c belongs to the next lexeme. */
      lexer_ungetc(c);
      return token_type;

    case LS_OPERATOR_BACKSLASH:
      if (c == '\n') {
        lexer_state = LS_OPERATOR;
        continue;
      }
      /* Not a line continuation: push back c, then let the operator code
         push back the backslash itself, so both are read again in order. */
      lexer_ungetc(c);
      c = '\\';
      goto operator;

    /* word */

    case LS_NUMBER:
      if (c == EOF)
        return TT_WORD;
      if (c == '\\') {
        lexer_state = LS_NUMBER_BACKSLASH;
        continue;
      }
      if (is_meta(c)) {
        lexer_ungetc(c);
        return c == '<' || c == '>' ? TT_NUMBER : TT_WORD;
      }
      if (!is_digit(c)) {
        lexer_state = LS_WORD;
        goto word;
      }
      goto add_digit;

    case LS_NUMBER_BACKSLASH:
      /* A digit run cut short by end of input is not followed by '<' or
         '>', so it is a plain word, exactly as in LS_NUMBER and
         LS_WORD_BACKSLASH. */
      if (c == EOF)
        return TT_WORD;
      if (c == '\n') {
        lexer_state = LS_NUMBER;
        continue;
      }
      lexer_state = LS_WORD_BACKSLASH;
      goto word_backslash;

    case LS_WORD:
      if (c == EOF)
        return TT_WORD;
      if (is_meta(c)) {
        lexer_ungetc(c);
        return TT_WORD;
      }
    word:
      switch (c) {
      case '\\':
        lexer_state = LS_WORD_BACKSLASH;
        continue;
      case '\'':
        lexer_state = LS_WORD_SINGLE_QUOTE;
        break;
      case '"':
        lexer_state = LS_WORD_DOUBLE_QUOTE;
        break;
      }
      goto word_append;

    case LS_WORD_BACKSLASH:
      if (c == EOF)
        return TT_WORD;
      if (c == '\n') {
        lexer_state = LS_WORD;
        continue;
      }
    word_backslash:
      lexer_state = LS_WORD;
      word_append('\\');
      goto word_append;

    case LS_WORD_SINGLE_QUOTE:
      if (c == EOF)
        goto error_unexpected_end_of_file;
      if (c == '\'')
        lexer_state = LS_WORD;
      goto word_append;

    case LS_WORD_DOUBLE_QUOTE:
      if (c == EOF)
        goto error_unexpected_end_of_file;
      switch (c) {
      case '\\':
        lexer_state = LS_WORD_DOUBLE_QUOTE_BACKSLASH;
        continue;
      case '"':
        lexer_state = LS_WORD;
        break;
      }
      goto word_append;

    case LS_WORD_DOUBLE_QUOTE_BACKSLASH:
      if (c == EOF)
        goto error_unexpected_end_of_file;
      lexer_state = LS_WORD_DOUBLE_QUOTE;
      if (c == '\n')
        continue;
      word_append('\\');
      goto word_append;
    }

    /* A digit is accumulated into number and also stored in the word, so
       that the lexeme can still turn out to be an ordinary word. */
  add_digit:
    add_digit(c);
  word_append:
    word_append(c);
  }

error_unexpected_end_of_file:
  syntax_error("unexpected end of file");
  free_word();
  error = 1;
  return TT_END_OF_INPUT;
}

/* tokenizer */

/* Fills current_token for a lexeme of the given type.  For TT_WORD the
   word buffer is handed over to current_token.word; for TT_NUMBER the
   buffer is freed and only the numeric value is kept. */
static void make_token(int type)
{
  current_token.type = type;
  switch (type) {
  case TT_NUMBER:
    free_word();
    current_token.number = number;
    break;
  case TT_WORD:
    end_word();
    current_token.word = word;
    break;
  case TT_END_OF_INPUT:
    current_token.error = error;
    break;
  }
}

/* Reads the next token into current_token. */
void lex(void)
{
  make_token(read_lexeme());
}

/* Discards input up to and including the next newline.  current_token
   becomes TT_NEWLINE, or TT_END_OF_INPUT (with error 0) if the input ends
   first. */
void skip_line(void)
{
  int c;

  do
    c = lexer_getc();
  while (c != EOF && c != '\n');
  if (c == EOF) {
    error = 0;
    make_token(TT_END_OF_INPUT);
  }
  else
    make_token(TT_NEWLINE);
}

/* Returns a newly allocated copy of string with quoting removed:
 *  - outside quotes, a backslash is dropped and the next character kept;
 *  - single quotes are dropped and their content is copied verbatim;
 *  - double quotes are dropped; inside them a backslash is dropped only
 *    before '"' or '\\', and is kept before any other character.
 * A backslash that is the last character of string is dropped.
 *
 * The result is never longer than string.  It is obtained from
 * safe_malloc() and belongs to the caller.
 */
char *remove_quotes(const char *string)
{
  char *res, *p;
  int state;
  char c;

  res = safe_malloc(strlen(string) + 1);
  p = res;
  state = LS_WORD;
  while ((c = *string++)) {
    switch (state) {
    case LS_WORD:
      switch (c) {
      case '\\':
        state = LS_WORD_BACKSLASH;
        continue;
      case '\'':
        state = LS_WORD_SINGLE_QUOTE;
        continue;
      case '"':
        state = LS_WORD_DOUBLE_QUOTE;
        continue;
      }
      break;
    case LS_WORD_BACKSLASH:
      state = LS_WORD;
      break;
    case LS_WORD_SINGLE_QUOTE:
      if (c == '\'') {
        state = LS_WORD;
        continue;
      }
      break;
    case LS_WORD_DOUBLE_QUOTE:
      switch (c) {
      case '\\':
        state = LS_WORD_DOUBLE_QUOTE_BACKSLASH;
        continue;
      case '"':
        state = LS_WORD;
        continue;
      }
      break;
    case LS_WORD_DOUBLE_QUOTE_BACKSLASH:
      switch (c) {
      case '"':
      case '\\':
        break;
      default:
        *p++ = '\\'; /* the backslash was not an escape: keep it */
      }
      state = LS_WORD_DOUBLE_QUOTE;
      break;
    }
    *p++ = c;
  }
  *p = '\0';
  return res;
}