/* View of xos/usr/xsh/lex.c */

/* Copyright (C) 2008  Emmanuel Varoquaux
 
   This file is part of XOS.
 
   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
 
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
 
#include "lex.h"
 
#include "safe_malloc.h"
#include "error.h"
#include "vars.h"
 
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <stdarg.h>
 
/* Character input */

/* Callback that supplies raw input characters.  lexer_getc() calls it
   whenever no pushed-back character is pending.  It is not assigned in the
   code below; presumably the user of the lexer installs it before calling
   lex() -- confirm against the callers. */
lexer_getc_func_t lexer_getc_func;

/* LIFO stack of characters returned to the input by lexer_ungetc().
   Two slots are needed because read_lexeme() can push back two characters
   before returning (see its LS_OPERATOR_BACKSLASH case).  pushed_back_pos is
   the number of characters currently stored. */
static int pushed_back_buffer[2];
static int pushed_back_pos = 0;
 
/* Return the next input character.  Pushed-back characters are delivered
   first, most recently pushed first; otherwise the character comes from
   lexer_getc_func().  line_number is incremented for every newline
   delivered. */
static int lexer_getc()
{
  int ch;

  if (pushed_back_pos) {
    pushed_back_pos--;
    ch = pushed_back_buffer[pushed_back_pos];
  }
  else
    ch = lexer_getc_func();

  if (ch == '\n')
    line_number++;
  return ch;
}
 
/* Give character c back to the input so that the next lexer_getc() returns
   it.  Undoes the line_number increment if c is a newline.  No bounds check
   is performed: callers must not have more than two characters pushed back
   at any time. */
static void lexer_ungetc(int c)
{
  pushed_back_buffer[pushed_back_pos] = c;
  pushed_back_pos++;
  if (c == '\n')
    line_number--;
}
 
/* Lexical analysis */

/* Initial size of the word buffer, and the amount by which it is enlarged
   each time it fills up (see init_word() and word_append()). */
#define WORD_GROW_SIZE 256

/* states of the lexical analyzer */
enum {
  LS_START,                      /* before the first character of a lexeme */
  LS_START_BACKSLASH,            /* backslash seen in LS_START */
  LS_COMMENT,                    /* after '#', up to the end of the line */

  /* operator */
  LS_OPERATOR,                   /* first character of an operator read */
  LS_OPERATOR_BACKSLASH,         /* backslash seen in LS_OPERATOR */

  /* word */
  LS_NUMBER,                     /* only digits read so far */
  LS_NUMBER_BACKSLASH,           /* backslash seen in LS_NUMBER */
  LS_WORD,                       /* unquoted part of a word */
  LS_WORD_BACKSLASH,             /* backslash seen in LS_WORD */
  LS_WORD_SINGLE_QUOTE,          /* between single quotes */
  LS_WORD_DOUBLE_QUOTE,          /* between double quotes */
  LS_WORD_DOUBLE_QUOTE_BACKSLASH /* backslash seen in LS_WORD_DOUBLE_QUOTE */
};

/* states for operator recognition: which first character was read */
enum {
  OS_AND,     /* '&' */
  OS_BAR,     /* '|' */
  OS_GREATER, /* '>' */
  OS_LESS     /* '<' */
};

static int lexer_state;    /* current LS_* state of read_lexeme() */
static int operator_state; /* current OS_* state while in LS_OPERATOR */
static int number;         /* value accumulated by add_digit() */
static char *word;         /* buffer holding the text of the current word */
static size_t word_size;   /* allocated size of word, in bytes */
static int word_pos;       /* number of characters stored in word */
static int error;          /* copied to current_token.error by make_token()
                              for TT_END_OF_INPUT */

/* The token most recently built by make_token(). */
struct token_struct current_token;
 
/* Character classification helpers.

   Each one accepts any int, including EOF, and only ever hands a value in
   the 7-bit ASCII range (0..0x7f) to the <ctype.h> functions.  The range is
   tested explicitly rather than with isascii(), which is not part of ISO C
   (it is a POSIX extension marked obsolescent) and is therefore undeclared
   when compiling in strict standard mode. */

/* Return non-zero if c is an ASCII blank (as defined by isblank()). */
static inline int is_blank(int c)
{
  return c >= 0 && c <= 0x7f && isblank(c);
}

/* Return non-zero if c is an ASCII decimal digit. */
static inline int is_digit(int c)
{
  return c >= 0 && c <= 0x7f && isdigit(c);
}

/* Return non-zero if c ends an unquoted word: a blank, or one of
   | & ; ( ) < > newline.  The explicit test against '\0' is needed because
   strchr() would otherwise match the terminator of the literal. */
static inline int is_meta(int c)
{
  return is_blank(c) || (c != '\0' && strchr("|&;()<>\n", c) != NULL);
}
 
/* Reset the numeric value accumulated for the current lexeme. */
static void init_number()
{
  number = 0;
}

/* Append decimal digit c ('0'..'9') to the value held in number.

   The value saturates at INT_MAX instead of overflowing: an arbitrarily
   long run of digits is a legal word, and signed integer overflow is
   undefined behaviour. */
static void add_digit(char c)
{
  const int digit = c - '0';

  /* number * 10 + digit <= INT_MAX  <=>  number <= (INT_MAX - digit) / 10 */
  if (number > (INT_MAX - digit) / 10)
    number = INT_MAX;
  else
    number = number * 10 + digit;
}
 
/* Start a new, empty word in a freshly allocated buffer of WORD_GROW_SIZE
   bytes. */
static void init_word()
{
  word_pos = 0;
  word_size = WORD_GROW_SIZE;
  word = safe_malloc(WORD_GROW_SIZE);
}

/* Add character c at the end of the word.  The buffer is enlarged by
   WORD_GROW_SIZE bytes just before the append that would fill its last
   byte, so one byte always remains free for the terminator written by
   end_word(). */
static void word_append(char c)
{
  const int last_slot = (word_pos + 1 == (int)word_size);

  if (last_slot) {
    word = safe_realloc(word, word_size + WORD_GROW_SIZE);
    word_size = word_size + WORD_GROW_SIZE;
  }
  word[word_pos] = c;
  word_pos++;
}

/* Terminate the word with a NUL byte so it can be used as a C string. */
static void end_word()
{
  word[word_pos] = '\0';
}

/* Release the buffer of the current word. */
static void free_word()
{
  free(word);
}
 
/* scanner */

/* Read one lexeme from the input and return its token type (a TT_* value).

   The function is a state machine driven by lexer_state; every iteration
   reads one character with lexer_getc() and dispatches on the state.
   A character that ends a lexeme without belonging to it is given back
   with lexer_ungetc().

   Results left for make_token():
   - TT_WORD: the text of the word, quotes and backslashes included, is in
     word (not yet NUL-terminated).
   - TT_NUMBER: the value is in number; word still holds the digits.
     Returned only for a run of digits immediately followed by '<' or '>'.
   - TT_END_OF_INPUT: error is 0 on a regular end of input, 1 when the input
     ended inside quotes (a syntax error is reported and word is freed).

   A backslash followed by a newline is a line continuation and is dropped.
   Any other backslash inside a word is stored together with the character
   that follows it. */
static int read_lexeme()
{
  int c;
  int token_type;

  lexer_state = LS_START;
  while (1) {
    c = lexer_getc();
    switch (lexer_state) {
    case LS_START:
      if (c == EOF) {
        error = 0;
        return TT_END_OF_INPUT;
      }
      /* skip blanks between lexemes */
      if (is_blank(c))
        continue;
      switch (c) {
      case '\\':
        lexer_state = LS_START_BACKSLASH;
        continue;
      case '|':
        lexer_state = LS_OPERATOR;
        operator_state = OS_BAR;
        continue;
      case '&':
        lexer_state = LS_OPERATOR;
        operator_state = OS_AND;
        continue;
      case ';':
        return TT_SEMICOLON;
      case '(':
        return TT_LEFT_PARENTHESIS;
      case ')':
        return TT_RIGHT_PARENTHESIS;
      case '<':
        lexer_state = LS_OPERATOR;
        operator_state = OS_LESS;
        continue;
      case '>':
        lexer_state = LS_OPERATOR;
        operator_state = OS_GREATER;
        continue;
      case '\n':
        return TT_NEWLINE;
      case '#':
        lexer_state = LS_COMMENT;
        continue;
      }
      /* anything else starts a word; a leading digit may start a number */
      init_word();
      if (is_digit(c)) {
        lexer_state = LS_NUMBER;
        init_number();
        goto add_digit;
      }
      else {
        lexer_state = LS_WORD;
        goto word;
      }
    case LS_START_BACKSLASH:
      if (c == EOF) {
        error = 0;
        return TT_END_OF_INPUT;
      }
      /* line continuation before any lexeme */
      if (c == '\n') {
        lexer_state = LS_START;
        continue;
      }
      /* escaped character: it starts a word */
      lexer_state = LS_WORD_BACKSLASH;
      init_word();
      goto word_backslash;
    case LS_COMMENT:
      if (c == EOF) {
        error = 0;
        return TT_END_OF_INPUT;
      }
      /* the newline ending a comment is itself a token */
      if (c == '\n')
        return TT_NEWLINE;
      continue;

      /* operator */
    case LS_OPERATOR:
      if (c == '\\') {
        lexer_state = LS_OPERATOR_BACKSLASH;
        continue;
      }
    operator:
      /* c is the character following the first character of the operator:
         either it completes a two-character operator, or it is pushed back
         and the one-character operator is returned */
      switch (operator_state) {
      case OS_AND:
        if (c == '&')
          return TT_AND_AND;
        token_type = TT_AMPERSAND;
        break;
      case OS_BAR:
        if (c == '|')
          return TT_OR_OR;
        token_type = TT_BAR;
        break;
      case OS_GREATER:
        switch (c) {
        case '>':
          return TT_GREATER_GREATER;
        case '&':
          return TT_GREATER_AND;
        case '|':
          return TT_GREATER_BAR;
        }
        token_type = TT_GREATER;
        break;
      case OS_LESS:
      default: /* also guarantees that token_type is always assigned */
        switch (c) {
        case '&':
          return TT_LESS_AND;
        case '>':
          return TT_LESS_GREATER;
        }
        token_type = TT_LESS;
        break;
      }
      lexer_ungetc(c);
      return token_type;
    case LS_OPERATOR_BACKSLASH:
      /* line continuation in the middle of an operator */
      if (c == '\n') {
        lexer_state = LS_OPERATOR;
        continue;
      }
      /* the backslash does not belong to the operator: give back the
         character just read, then handle the backslash as the character
         following the operator (it will be pushed back in turn, so that
         the two characters are read again in their original order) */
      lexer_ungetc(c);
      c = '\\';
      goto operator;

      /* word */
    case LS_NUMBER:
      if (c == EOF)
        return TT_WORD;
      if (c == '\\') {
        lexer_state = LS_NUMBER_BACKSLASH;
        continue;
      }
      if (is_meta(c)) {
        lexer_ungetc(c);
        /* digits are a number only when directly followed by a redirection
           operator; otherwise they are an ordinary word */
        return c == '<' || c == '>' ? TT_NUMBER : TT_WORD;
      }
      if (!is_digit(c)) {
        lexer_state = LS_WORD;
        goto word;
      }
      goto add_digit;
    case LS_NUMBER_BACKSLASH:
      /* End of input after a trailing backslash: the digits read so far are
         an ordinary word, exactly as in LS_NUMBER and LS_WORD_BACKSLASH.
         (Returning TT_NUMBER here would make make_token() discard the text
         although no redirection operator follows.) */
      if (c == EOF)
        return TT_WORD;
      if (c == '\n') {
        lexer_state = LS_NUMBER;
        continue;
      }
      lexer_state = LS_WORD_BACKSLASH;
      goto word_backslash;
    case LS_WORD:
      if (c == EOF)
        return TT_WORD;
      if (is_meta(c)) {
        lexer_ungetc(c);
        return TT_WORD;
      }
    word:
      switch (c) {
      case '\\':
        lexer_state = LS_WORD_BACKSLASH;
        continue;
      case '\'':
        lexer_state = LS_WORD_SINGLE_QUOTE;
        break;
      case '"':
        lexer_state = LS_WORD_DOUBLE_QUOTE;
        break;
      }
      /* quote characters are stored in the word like any other character */
      goto word_append;
    case LS_WORD_BACKSLASH:
      if (c == EOF)
        return TT_WORD;
      if (c == '\n') {
        lexer_state = LS_WORD;
        continue;
      }
    word_backslash:
      /* store the backslash, then the escaped character */
      lexer_state = LS_WORD;
      word_append('\\');
      goto word_append;
    case LS_WORD_SINGLE_QUOTE:
      if (c == EOF)
        goto error_unexpected_end_of_file;
      if (c == '\'')
        lexer_state = LS_WORD;
      goto word_append;
    case LS_WORD_DOUBLE_QUOTE:
      if (c == EOF)
        goto error_unexpected_end_of_file;
      switch (c) {
      case '\\':
        lexer_state = LS_WORD_DOUBLE_QUOTE_BACKSLASH;
        continue;
      case '"':
        lexer_state = LS_WORD;
        break;
      }
      goto word_append;
    case LS_WORD_DOUBLE_QUOTE_BACKSLASH:
      if (c == EOF)
        goto error_unexpected_end_of_file;
      lexer_state = LS_WORD_DOUBLE_QUOTE;
      /* line continuation inside double quotes */
      if (c == '\n')
        continue;
      word_append('\\');
      goto word_append;
    }
    /* shared tails of the word states: a digit is accumulated in number
       and also stored in the word */
  add_digit:
    add_digit(c);
  word_append:
    word_append(c);
  }
 error_unexpected_end_of_file:
  syntax_error("unexpected end of file");
  free_word();
  error = 1;
  return TT_END_OF_INPUT;
}
 
/* tokenizer */

/* Fill current_token for a lexeme of the given type, using the results
   left by the scanner:
   - TT_WORD: the word is NUL-terminated and its buffer is handed over to
     current_token.word;
   - TT_NUMBER: the word buffer is released and current_token.number
     receives the accumulated value;
   - TT_END_OF_INPUT: current_token.error receives the error flag.
   Every other type carries no additional data. */
static void make_token(int type)
{
  current_token.type = type;

  if (type == TT_WORD) {
    end_word();
    current_token.word = word;
  }
  else if (type == TT_NUMBER) {
    free_word();
    current_token.number = number;
  }
  else if (type == TT_END_OF_INPUT)
    current_token.error = error;
}
 
/* Read the next token from the input into current_token. */
void lex()
{
  const int type = read_lexeme();

  make_token(type);
}
 
/* Discard input up to and including the next newline, or up to the end of
   input.  current_token becomes TT_NEWLINE in the first case and
   TT_END_OF_INPUT (with the error flag cleared) in the second. */
void skip_line()
{
  int c;

  for (;;) {
    c = lexer_getc();
    if (c == EOF || c == '\n')
      break;
  }

  if (c == '\n') {
    make_token(TT_NEWLINE);
    return;
  }
  error = 0;
  make_token(TT_END_OF_INPUT);
}
 
/* Return a copy of string, allocated with safe_malloc(), from which the
   quoting characters have been removed:
   - outside quotes, a backslash is dropped and the character after it is
     kept literally;
   - single quotes are dropped and everything between them is kept as is;
   - double quotes are dropped; between them a backslash is dropped only in
     front of '"' or '\\', and kept in front of any other character.
   A backslash that is the last character of string is dropped.
   The result is never longer than string. */
char *remove_quotes(const char *string)
{
  char *const result = safe_malloc(strlen(string) + 1);
  char *out = result;
  const char *in;
  int mode = LS_WORD;

  for (in = string; *in != '\0'; in++) {
    const char ch = *in;
    int keep = 1; /* cleared when ch is a quoting character to drop */

    if (mode == LS_WORD) {
      if (ch == '\\') {
        mode = LS_WORD_BACKSLASH;
        keep = 0;
      }
      else if (ch == '\'') {
        mode = LS_WORD_SINGLE_QUOTE;
        keep = 0;
      }
      else if (ch == '"') {
        mode = LS_WORD_DOUBLE_QUOTE;
        keep = 0;
      }
    }
    else if (mode == LS_WORD_BACKSLASH) {
      /* escaped character: kept whatever it is */
      mode = LS_WORD;
    }
    else if (mode == LS_WORD_SINGLE_QUOTE) {
      if (ch == '\'') {
        mode = LS_WORD;
        keep = 0;
      }
    }
    else if (mode == LS_WORD_DOUBLE_QUOTE) {
      if (ch == '\\') {
        mode = LS_WORD_DOUBLE_QUOTE_BACKSLASH;
        keep = 0;
      }
      else if (ch == '"') {
        mode = LS_WORD;
        keep = 0;
      }
    }
    else if (mode == LS_WORD_DOUBLE_QUOTE_BACKSLASH) {
      /* the backslash only escapes '"' and '\\'; otherwise it is literal */
      if (ch != '"' && ch != '\\')
        *out++ = '\\';
      mode = LS_WORD_DOUBLE_QUOTE;
    }

    if (keep)
      *out++ = ch;
  }
  *out = '\0';
  return result;
}