I am studying GCC's lexer purely out of personal interest, using some free time over the holidays to understand how this enormous machine runs.
The essence of lexical analysis is an automaton: from the first character it can already tell whether what follows is an identifier or some other kind of token.
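To make the idea concrete, here is a tiny hand-written sketch of such an automaton (purely illustrative and entirely my own code, not GCC's; the toy_next_token name and token kinds are made up):

#include <ctype.h>

enum toy_ttype { TOY_IDENT, TOY_NUMBER, TOY_PUNCT, TOY_EOF };

/* A minimal dispatch-on-first-character lexer: the "state" is simply
   which branch we take, much like the big switch in _cpp_lex_direct.  */
static enum toy_ttype
toy_next_token (const char **p)
{
  while (**p == ' ' || **p == '\t' || **p == '\n')
    (*p)++;                                /* skip whitespace */
  if (**p == '\0')
    return TOY_EOF;
  if (isalpha ((unsigned char) **p) || **p == '_')
    {
      while (isalnum ((unsigned char) **p) || **p == '_')
        (*p)++;                            /* identifier body */
      return TOY_IDENT;
    }
  if (isdigit ((unsigned char) **p))
    {
      while (isalnum ((unsigned char) **p) || **p == '.')
        (*p)++;                            /* crude number */
      return TOY_NUMBER;
    }
  (*p)++;                                  /* single-character punctuator */
  return TOY_PUNCT;
}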
I want to keep the focus on the code itself, which lives in lex.c of libcpp.
Broadly, token recognition is carried out by c_lex_one_token, _cpp_lex_direct and cpp_classify_number:
_cpp_lex_direct recognizes individual tokens, and cpp_classify_number classifies the numeric constants.
The code at the entry of the start state looks like this:
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  cpp_token *result = pfile->cur_token++;

 fresh_line:              /* re-entered when a newline forces a fresh line */
  buffer = pfile->buffer;
  /* ... refill the buffer here if need_line is set ... */
 skipped_white:           /* re-entered once leading whitespace is skipped */
  c = *buffer->cur++;
  /* ... big switch on c, shown piece by piece below ... */
}
The main code path is a switch on that character:
  switch (c)
    {
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      if (buffer->cur < buffer->rlimit)
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;
For these whitespace characters the lexer records PREV_WHITE on the token, skips ahead and jumps back to skipped_white; when it sees '\n' it bumps the line counter, marks that a new line must be fetched and jumps back to fresh_line.
Next, a digit means the token is a number:
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        lex_number (pfile, &result->val.str, &nst);  /* dedicated number scanner */
        warn_about_normalization (pfile, result, &nst);
        break;
      }
The actual scanning is delegated to lex_number, shown later. A '.' can also start a number:
    case '.':
      result->type = CPP_DOT;
      if (ISDIGIT (*buffer->cur))
        {
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;
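So a single character of lookahead after the '.' decides the token: a digit gives a number, ".." completes an ellipsis, '*' (in C++) gives the pointer-to-member operator, and anything else is a plain dot. A rough standalone illustration of the same dispatch (classify_dot is my own toy function, not part of GCC):

#include <ctype.h>

/* Classify a token that starts with '.', the way the case above does:
   ".5" -> number, "..." -> ellipsis, ".*" -> pointer-to-member (C++),
   anything else -> plain dot.  */
static const char *
classify_dot (const char *p)          /* p points just past the '.' */
{
  if (isdigit ((unsigned char) *p))
    return "CPP_NUMBER";
  if (p[0] == '.' && p[1] == '.')
    return "CPP_ELLIPSIS";
  if (p[0] == '*')
    return "CPP_DOT_STAR";            /* only when lexing C++ */
  return "CPP_DOT";
}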
Literals and identifiers are handled in the same spirit. If the first character is '\'' or '"', the token is a character constant or a string literal. If the first character is 'L', 'u', 'U' or 'R', it may instead be the prefix of a wide or raw literal: when a quote follows, the token is lexed as a character or string literal, otherwise those letters simply begin an identifier, and the rest of the work is handed to a helper function.
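The decisive test is a peek at the next character or two. A simplified sketch of that prefix check (illustrative only, my own helper; the real code additionally respects language options such as u8 prefixes and whether raw literals are enabled):

#include <stdbool.h>

/* Does a token starting with 'L', 'u', 'U' or 'R' continue as a wide/raw
   character or string literal?  If not, the letter just starts an
   identifier.  Simplified relative to the real lexer.  */
static bool
starts_literal_prefix (char c, const char *next)
{
  if (c != 'L' && c != 'u' && c != 'U' && c != 'R')
    return false;
  if (next[0] == '"' || (next[0] == '\'' && c != 'R'))
    return true;                       /* L"...", u'x', U"...", R"..." */
  if (c != 'R' && next[0] == 'R' && next[1] == '"')
    return true;                       /* LR"...", uR"...", UR"..." */
  return false;
}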
Then come the operators and separators. First the quote cases, then '/':
    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);  /* character/string literal */
      break;
    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;   /* remember where the comment text starts */
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }                            /* the helper walks past the whole block comment */
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
                            || cpp_in_system_header (pfile)))
        {
          /* Warn about comments only if pedantically GNUC89, and not
             in system headers.  */
          if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
              && ! buffer->warned_cplusplus_comments)
            {
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "C++ style comments are not allowed in ISO C90");
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }

          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }
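So one more character of lookahead disambiguates '/': '*' and '/' start comments, '=' gives the compound assignment /=, and anything else is plain division. Note the contract of _cpp_skip_block_comment: it consumes the whole comment and returns true only when the comment is unterminated, which is why the error is issued on a true return. A self-contained sketch of that behaviour (toy_skip_block_comment is my own helper, written under the same true-means-unterminated assumption):

#include <stdbool.h>

/* Skip a block comment whose opening slash and star have already been
   consumed; return true if the input ends before the closing star-slash.  */
static bool
toy_skip_block_comment (const char **p)
{
  const char *cur = *p;
  while (*cur != '\0')
    {
      if (cur[0] == '*' && cur[1] == '/')
        {
          *p = cur + 2;
          return false;              /* properly terminated */
        }
      cur++;
    }
  *p = cur;
  return true;                       /* hit end of input: unterminated */
}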
Now consider '<':
    case '<':
      if (pfile->state.angled_headers)
        {
          lex_string (pfile, result, buffer->cur - 1);
          if (result->type != CPP_LESS)
            break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
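Two details are worth pointing out. Inside an #include directive (pfile->state.angled_headers) the '<' may open a header name, so lex_string gets the first try; and when digraphs are enabled, '<:' and '<%' are alternative spellings of '[' and '{', which is why those tokens carry the DIGRAPH flag. For example, with digraphs the following declaration lexes like int a[2] = {1, 2}; (only the DIGRAPH flag on the bracket and brace tokens tells them apart):

/* Digraph demo: "<:" and ":>" stand for '[' and ']',
   "<%" and "%>" stand for '{' and '}'.  */
int a<:2:> = <%1, 2%>;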
The remaining operators follow the same pattern, so I will not go through them all.
Next is the lex_identifier function:
static cpp_hashnode *
lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
                struct normalize_state *nst)
{
  cpp_hashnode *result;
  const uchar *cur;
  unsigned int len;
  unsigned int hash = HT_HASHSTEP (0, *base);

  cur = pfile->buffer->cur;
  if (! starts_ucn)
    while (ISIDNUM (*cur))
      {
        hash = HT_HASHSTEP (hash, *cur);
        cur++;
      }
  pfile->buffer->cur = cur;
  if (starts_ucn || forms_identifier_p (pfile, false, nst))
    {
      /* Slower version for identifiers containing UCNs (or $).  */
      do {
        while (ISIDNUM (*pfile->buffer->cur))
          {
            pfile->buffer->cur++;
            NORMALIZE_STATE_UPDATE_IDNUM (nst);
          }
      } while (forms_identifier_p (pfile, false, nst));

      result = _cpp_interpret_identifier (pfile, base,
                                          pfile->buffer->cur - base);
    }
  else
    {
      len = cur - base;
      hash = HT_HASHFINISH (hash, len);

      result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
                                                  base, len, hash, HT_ALLOC));
    }

  /* Rarely, identifiers require diagnostics when lexed.  */
  if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
                        && !pfile->state.skipping, 0))
    {
      /* It is allowed to poison the same identifier twice.  */
      if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
        cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
                   NODE_NAME (result));

      /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
         replacement list of a variadic macro.  */
      if (result == pfile->spec_nodes.n__VA_ARGS__
          && !pfile->state.va_args_ok)
        cpp_error (pfile, CPP_DL_PEDWARN,
                   "__VA_ARGS__ can only appear in the expansion"
                   " of a C99 variadic macro");
    }

  return result;
}
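The interesting part is the fast path: for an ordinary identifier the hash is accumulated character by character while scanning, so a single ht_lookup_with_hash call both interns the spelling and returns the node carrying macro and diagnostic flags; the slower _cpp_interpret_identifier route is only needed when UCNs (or '$') are involved. A self-contained sketch of that scan-and-hash idea, with an FNV-1a hash standing in for libcpp's HT_HASHSTEP/HT_HASHFINISH (toy_lex_identifier is my own name):

#include <ctype.h>
#include <stddef.h>

/* Scan an identifier starting at *p and hash it on the fly, so the
   following symbol-table lookup needs no second pass over the spelling.  */
static size_t
toy_lex_identifier (const char **p, unsigned int *hash_out)
{
  const char *base = *p;
  unsigned int hash = 2166136261u;                      /* FNV-1a offset basis */
  while (isalnum ((unsigned char) **p) || **p == '_')
    {
      hash = (hash ^ (unsigned char) **p) * 16777619u;  /* FNV-1a step */
      (*p)++;
    }
  *hash_out = hash;
  return (size_t) (*p - base);   /* length; caller does one hash-table lookup */
}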
The main point here is the diagnostics at the end: only identifiers that meet those conditions are reported, namely poisoned identifiers and __VA_ARGS__ used outside the replacement list of a variadic macro.
Next comes the lexical analysis of numbers:
static void
lex_number (cpp_reader *pfile, cpp_string *number,
            struct normalize_state *nst)
{
  const uchar *cur;
  const uchar *base;
  uchar *dest;

  base = pfile->buffer->cur - 1;
  do
    {
      cur = pfile->buffer->cur;

      /* N.B. ISIDNUM does not include $.  */
      while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
        {
          cur++;
          NORMALIZE_STATE_UPDATE_IDNUM (nst);
        }
      pfile->buffer->cur = cur;
    }
  while (forms_identifier_p (pfile, false, nst));

  number->len = cur - base;
  dest = _cpp_unaligned_alloc (pfile, number->len + 1);
  memcpy (dest, base, number->len);
  dest[number->len] = '\0';
  number->text = dest;
}
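Notice how permissive this scan is: it accepts identifier characters, '.', and a sign immediately after an exponent letter (that is what VALID_SIGN checks), so at this stage the lexer only collects a preprocessing number; whether the spelling is a valid constant is decided later, by cpp_classify_number. A few examples of spellings that this rule gathers into a single CPP_NUMBER token:

/* All of these are collected as one pp-number; validity is checked later.  */
const char *pp_numbers[] = {
  "42",            /* ordinary integer                         */
  "0x1.8p+3",      /* hexadecimal float: '+' allowed after 'p' */
  "1e+10",         /* '+' allowed after 'e'                    */
  "1.2.3",         /* still one pp-number, rejected later      */
  "0xDEADbeefULL",
};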
/* Create a token of type TYPE with a literal spelling.  */
static void
create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
                unsigned int len, enum cpp_ttype type)
{
  uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);

  memcpy (dest, base, len);
  dest[len] = '\0';
  token->type = type;
  token->val.str.len = len;
  token->val.str.text = dest;
}