diff options
Diffstat (limited to 'src/LexPerl.cpp')
-rwxr-xr-x | src/LexPerl.cpp | 1256 |
1 files changed, 1256 insertions, 0 deletions
diff --git a/src/LexPerl.cpp b/src/LexPerl.cpp new file mode 100755 index 0000000..4a42bc2 --- /dev/null +++ b/src/LexPerl.cpp @@ -0,0 +1,1256 @@ +// Scintilla source code edit control +/** @file LexPerl.cxx + ** Lexer for subset of Perl. + **/ +// Copyright 1998-2005 by Neil Hodgson <neilh@scintilla.org> +// Lexical analysis fixes by Kein-Hong Man <mkh@pl.jaring.my> +// The License.txt file describes the conditions under which this software may be distributed. + +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <stdio.h> +#include <stdarg.h> + +#include "Platform.h" + +#include "PropSet.h" +#include "Accessor.h" +#include "KeyWords.h" +#include "Scintilla.h" +#include "SciLexer.h" + +#define PERLNUM_BINARY 1 // order is significant: 1-4 cannot have a dot +#define PERLNUM_HEX 2 +#define PERLNUM_OCTAL 3 +#define PERLNUM_FLOAT 4 // actually exponent part +#define PERLNUM_DECIMAL 5 // 1-5 are numbers; 6-7 are strings +#define PERLNUM_VECTOR 6 +#define PERLNUM_V_VECTOR 7 +#define PERLNUM_BAD 8 + +#define BACK_NONE 0 // lookback state for bareword disambiguation: +#define BACK_OPERATOR 1 // whitespace/comments are insignificant +#define BACK_KEYWORD 2 // operators/keywords are needed for disambiguation + +#define HERE_DELIM_MAX 256 + +static inline bool isEOLChar(char ch) { + return (ch == '\r') || (ch == '\n'); +} + +static bool isSingleCharOp(char ch) { + char strCharSet[2]; + strCharSet[0] = ch; + strCharSet[1] = '\0'; + return (NULL != strstr("rwxoRWXOezsfdlpSbctugkTBMAC", strCharSet)); +} + +static inline bool isPerlOperator(char ch) { + if (ch == '^' || ch == '&' || ch == '\\' || + ch == '(' || ch == ')' || ch == '-' || ch == '+' || + ch == '=' || ch == '|' || ch == '{' || ch == '}' || + ch == '[' || ch == ']' || ch == ':' || ch == ';' || + ch == '>' || ch == ',' || + ch == '?' || ch == '!' || ch == '.' || ch == '~') + return true; + // these chars are already tested before this call + // ch == '%' || ch == '*' || ch == '<' || ch == '/' || + return false; +} + +static bool isPerlKeyword(unsigned int start, unsigned int end, WordList &keywords, Accessor &styler) { + char s[100]; + unsigned int i, len = end - start; + if (len > 30) { len = 30; } + for (i = 0; i < len; i++, start++) s[i] = styler[start]; + s[i] = '\0'; + return keywords.InList(s); +} + +// Note: as lexer uses chars, UTF-8 bytes are considered as <0 values +// Note: iswordchar() was used in only one place in LexPerl, it is +// unnecessary as '.' is processed as the concatenation operator, so +// only isWordStart() is used in LexPerl + +static inline bool isWordStart(char ch) { + return !isascii(ch) || isalnum(ch) || ch == '_'; +} + +static inline bool isEndVar(char ch) { + return isascii(ch) && !isalnum(ch) && ch != '#' && ch != '$' && + ch != '_' && ch != '\''; +} + +static inline bool isNonQuote(char ch) { + return !isascii(ch) || isalnum(ch) || ch == '_'; +} + +static inline char actualNumStyle(int numberStyle) { + if (numberStyle == PERLNUM_VECTOR || numberStyle == PERLNUM_V_VECTOR) { + return SCE_PL_STRING; + } else if (numberStyle == PERLNUM_BAD) { + return SCE_PL_ERROR; + } + return SCE_PL_NUMBER; +} + +static bool isMatch(Accessor &styler, int lengthDoc, int pos, const char *val) { + if ((pos + static_cast<int>(strlen(val))) >= lengthDoc) { + return false; + } + while (*val) { + if (*val != styler[pos++]) { + return false; + } + val++; + } + return true; +} + +static char opposite(char ch) { + if (ch == '(') + return ')'; + if (ch == '[') + return ']'; + if (ch == '{') + return '}'; + if (ch == '<') + return '>'; + return ch; +} + +static void ColourisePerlDoc(unsigned int startPos, int length, int initStyle, + WordList *keywordlists[], Accessor &styler) { + + // Lexer for perl often has to backtrack to start of current style to determine + // which characters are being used as quotes, how deeply nested is the + // start position and what the termination string is for here documents + + WordList &keywords = *keywordlists[0]; + + class HereDocCls { + public: + int State; // 0: '<<' encountered + // 1: collect the delimiter + // 2: here doc text (lines after the delimiter) + char Quote; // the char after '<<' + bool Quoted; // true if Quote in ('\'','"','`') + int DelimiterLength; // strlen(Delimiter) + char *Delimiter; // the Delimiter, 256: sizeof PL_tokenbuf + HereDocCls() { + State = 0; + Quote = 0; + Quoted = false; + DelimiterLength = 0; + Delimiter = new char[HERE_DELIM_MAX]; + Delimiter[0] = '\0'; + } + ~HereDocCls() { + delete []Delimiter; + } + }; + HereDocCls HereDoc; // TODO: FIFO for stacked here-docs + + class QuoteCls { + public: + int Rep; + int Count; + char Up; + char Down; + QuoteCls() { + this->New(1); + } + void New(int r) { + Rep = r; + Count = 0; + Up = '\0'; + Down = '\0'; + } + void Open(char u) { + Count++; + Up = u; + Down = opposite(Up); + } + }; + QuoteCls Quote; + + int state = initStyle; + char numState = PERLNUM_DECIMAL; + int dotCount = 0; + unsigned int lengthDoc = startPos + length; + //int sookedpos = 0; // these have no apparent use, see POD state + //char sooked[100]; + //sooked[sookedpos] = '\0'; + + // If in a long distance lexical state, seek to the beginning to find quote characters + // Perl strings can be multi-line with embedded newlines, so backtrack. + // Perl numbers have additional state during lexing, so backtrack too. + if (state == SCE_PL_HERE_Q || state == SCE_PL_HERE_QQ || state == SCE_PL_HERE_QX) { + while ((startPos > 1) && (styler.StyleAt(startPos) != SCE_PL_HERE_DELIM)) { + startPos--; + } + startPos = styler.LineStart(styler.GetLine(startPos)); + state = styler.StyleAt(startPos - 1); + } + if ( state == SCE_PL_STRING_Q + || state == SCE_PL_STRING_QQ + || state == SCE_PL_STRING_QX + || state == SCE_PL_STRING_QR + || state == SCE_PL_STRING_QW + || state == SCE_PL_REGEX + || state == SCE_PL_REGSUBST + || state == SCE_PL_STRING + || state == SCE_PL_BACKTICKS + || state == SCE_PL_CHARACTER + || state == SCE_PL_NUMBER + || state == SCE_PL_IDENTIFIER + || state == SCE_PL_ERROR + ) { + while ((startPos > 1) && (styler.StyleAt(startPos - 1) == state)) { + startPos--; + } + state = SCE_PL_DEFAULT; + } + + // lookback at start of lexing to set proper state for backflag + // after this, they are updated when elements are lexed + int backflag = BACK_NONE; + unsigned int backPos = startPos; + if (backPos > 0) { + backPos--; + int sty = SCE_PL_DEFAULT; + while ((backPos > 0) && (sty = styler.StyleAt(backPos), + sty == SCE_PL_DEFAULT || sty == SCE_PL_COMMENTLINE)) + backPos--; + if (sty == SCE_PL_OPERATOR) + backflag = BACK_OPERATOR; + else if (sty == SCE_PL_WORD) + backflag = BACK_KEYWORD; + } + + styler.StartAt(startPos); + char chPrev = styler.SafeGetCharAt(startPos - 1); + if (startPos == 0) + chPrev = '\n'; + char chNext = styler[startPos]; + styler.StartSegment(startPos); + + for (unsigned int i = startPos; i < lengthDoc; i++) { + char ch = chNext; + // if the current character is not consumed due to the completion of an + // earlier style, lexing can be restarted via a simple goto + restartLexer: + chNext = styler.SafeGetCharAt(i + 1); + char chNext2 = styler.SafeGetCharAt(i + 2); + + if (styler.IsLeadByte(ch)) { + chNext = styler.SafeGetCharAt(i + 2); + chPrev = ' '; + i += 1; + continue; + } + if ((chPrev == '\r' && ch == '\n')) { // skip on DOS/Windows + styler.ColourTo(i, state); + chPrev = ch; + continue; + } + + if (HereDoc.State == 1 && isEOLChar(ch)) { + // Begin of here-doc (the line after the here-doc delimiter): + // Lexically, the here-doc starts from the next line after the >>, but the + // first line of here-doc seem to follow the style of the last EOL sequence + HereDoc.State = 2; + if (HereDoc.Quoted) { + if (state == SCE_PL_HERE_DELIM) { + // Missing quote at end of string! We are stricter than perl. + // Colour here-doc anyway while marking this bit as an error. + state = SCE_PL_ERROR; + } + styler.ColourTo(i - 1, state); + switch (HereDoc.Quote) { + case '\'': + state = SCE_PL_HERE_Q ; + break; + case '"': + state = SCE_PL_HERE_QQ; + break; + case '`': + state = SCE_PL_HERE_QX; + break; + } + } else { + styler.ColourTo(i - 1, state); + switch (HereDoc.Quote) { + case '\\': + state = SCE_PL_HERE_Q ; + break; + default : + state = SCE_PL_HERE_QQ; + } + } + } + + if (state == SCE_PL_DEFAULT) { + if ((isascii(ch) && isdigit(ch)) || (isascii(chNext) && isdigit(chNext) && + (ch == '.' || ch == 'v'))) { + state = SCE_PL_NUMBER; + backflag = BACK_NONE; + numState = PERLNUM_DECIMAL; + dotCount = 0; + if (ch == '0') { // hex,bin,octal + if (chNext == 'x') { + numState = PERLNUM_HEX; + } else if (chNext == 'b') { + numState = PERLNUM_BINARY; + } else if (isascii(chNext) && isdigit(chNext)) { + numState = PERLNUM_OCTAL; + } + if (numState != PERLNUM_DECIMAL) { + i++; + ch = chNext; + chNext = chNext2; + } + } else if (ch == 'v') { // vector + numState = PERLNUM_V_VECTOR; + } + } else if (isWordStart(ch)) { + // if immediately prefixed by '::', always a bareword + state = SCE_PL_WORD; + if (chPrev == ':' && styler.SafeGetCharAt(i - 2) == ':') { + state = SCE_PL_IDENTIFIER; + } + unsigned int kw = i + 1; + // first check for possible quote-like delimiter + if (ch == 's' && !isNonQuote(chNext)) { + state = SCE_PL_REGSUBST; + Quote.New(2); + } else if (ch == 'm' && !isNonQuote(chNext)) { + state = SCE_PL_REGEX; + Quote.New(1); + } else if (ch == 'q' && !isNonQuote(chNext)) { + state = SCE_PL_STRING_Q; + Quote.New(1); + } else if (ch == 'y' && !isNonQuote(chNext)) { + state = SCE_PL_REGSUBST; + Quote.New(2); + } else if (ch == 't' && chNext == 'r' && !isNonQuote(chNext2)) { + state = SCE_PL_REGSUBST; + Quote.New(2); + kw++; + } else if (ch == 'q' && (chNext == 'q' || chNext == 'r' || chNext == 'w' || chNext == 'x') && !isNonQuote(chNext2)) { + if (chNext == 'q') state = SCE_PL_STRING_QQ; + else if (chNext == 'x') state = SCE_PL_STRING_QX; + else if (chNext == 'r') state = SCE_PL_STRING_QR; + else if (chNext == 'w') state = SCE_PL_STRING_QW; + Quote.New(1); + kw++; + } else if (ch == 'x' && (chNext == '=' || // repetition + !isWordStart(chNext) || + (isdigit(chPrev) && isdigit(chNext)))) { + state = SCE_PL_OPERATOR; + } + // if potentially a keyword, scan forward and grab word, then check + // if it's really one; if yes, disambiguation test is performed + // otherwise it is always a bareword and we skip a lot of scanning + // note: keywords assumed to be limited to [_a-zA-Z] only + if (state == SCE_PL_WORD) { + while (isWordStart(styler.SafeGetCharAt(kw))) kw++; + if (!isPerlKeyword(styler.GetStartSegment(), kw, keywords, styler)) { + state = SCE_PL_IDENTIFIER; + } + } + // if already SCE_PL_IDENTIFIER, then no ambiguity, skip this + // for quote-like delimiters/keywords, attempt to disambiguate + // to select for bareword, change state -> SCE_PL_IDENTIFIER + if (state != SCE_PL_IDENTIFIER && i > 0) { + unsigned int j = i; + bool moreback = false; // true if passed newline/comments + bool brace = false; // true if opening brace found + char ch2; + // first look backwards past whitespace/comments for EOLs + // if BACK_NONE, neither operator nor keyword, so skip test + if (backflag != BACK_NONE) { + while (--j > backPos) { + if (isEOLChar(styler.SafeGetCharAt(j))) + moreback = true; + } + ch2 = styler.SafeGetCharAt(j); + if (ch2 == '{' && !moreback) { + // {bareword: possible variable spec + brace = true; + } else if ((ch2 == '&' && styler.SafeGetCharAt(j - 1) != '&') + // &bareword: subroutine call + || (ch2 == '>' && styler.SafeGetCharAt(j - 1) == '-') + // ->bareword: part of variable spec + || (ch2 == 'b' && styler.Match(j - 2, "su"))) { + // sub bareword: subroutine declaration + // (implied BACK_KEYWORD, no keywords end in 'sub'!) + state = SCE_PL_IDENTIFIER; + } + // if status still ambiguous, look forward after word past + // tabs/spaces only; if ch2 isn't one of '[{(,' it can never + // match anything, so skip the whole thing + j = kw; + if (state != SCE_PL_IDENTIFIER + && (ch2 == '{' || ch2 == '(' || ch2 == '['|| ch2 == ',') + && kw < lengthDoc) { + while (ch2 = styler.SafeGetCharAt(j), + (ch2 == ' ' || ch2 == '\t') && j < lengthDoc) { + j++; + } + if ((ch2 == '}' && brace) + // {bareword}: variable spec + || (ch2 == '=' && styler.SafeGetCharAt(j + 1) == '>')) { + // [{(, bareword=>: hash literal + state = SCE_PL_IDENTIFIER; + } + } + } + } + backflag = BACK_NONE; + // an identifier or bareword + if (state == SCE_PL_IDENTIFIER) { + if ((!isWordStart(chNext) && chNext != '\'') + || (chNext == '.' && chNext2 == '.')) { + // We need that if length of word == 1! + // This test is copied from the SCE_PL_WORD handler. + styler.ColourTo(i, SCE_PL_IDENTIFIER); + state = SCE_PL_DEFAULT; + } + // a keyword + } else if (state == SCE_PL_WORD) { + i = kw - 1; + if (ch == '_' && chNext == '_' && + (isMatch(styler, lengthDoc, styler.GetStartSegment(), "__DATA__") + || isMatch(styler, lengthDoc, styler.GetStartSegment(), "__END__"))) { + styler.ColourTo(i, SCE_PL_DATASECTION); + state = SCE_PL_DATASECTION; + } else { + styler.ColourTo(i, SCE_PL_WORD); + state = SCE_PL_DEFAULT; + backflag = BACK_KEYWORD; + backPos = i; + } + ch = styler.SafeGetCharAt(i); + chNext = styler.SafeGetCharAt(i + 1); + // a repetition operator 'x' + } else if (state == SCE_PL_OPERATOR) { + styler.ColourTo(i, SCE_PL_OPERATOR); + state = SCE_PL_DEFAULT; + // quote-like delimiter, skip one char if double-char delimiter + } else { + i = kw - 1; + chNext = styler.SafeGetCharAt(i + 1); + } + } else if (ch == '#') { + state = SCE_PL_COMMENTLINE; + } else if (ch == '\"') { + state = SCE_PL_STRING; + Quote.New(1); + Quote.Open(ch); + backflag = BACK_NONE; + } else if (ch == '\'') { + if (chPrev == '&') { + // Archaic call + styler.ColourTo(i, state); + } else { + state = SCE_PL_CHARACTER; + Quote.New(1); + Quote.Open(ch); + } + backflag = BACK_NONE; + } else if (ch == '`') { + state = SCE_PL_BACKTICKS; + Quote.New(1); + Quote.Open(ch); + backflag = BACK_NONE; + } else if (ch == '$') { + if ((chNext == '{') || isspacechar(chNext)) { + styler.ColourTo(i, SCE_PL_SCALAR); + } else { + state = SCE_PL_SCALAR; + if ((chNext == '`' && chNext2 == '`') + || (chNext == ':' && chNext2 == ':')) { + i += 2; + ch = styler.SafeGetCharAt(i); + chNext = styler.SafeGetCharAt(i + 1); + } else { + i++; + ch = chNext; + chNext = chNext2; + } + } + backflag = BACK_NONE; + } else if (ch == '@') { + if (!isascii(chNext) || isalpha(chNext) || chNext == '#' || chNext == '$' + || chNext == '_' || chNext == '+' || chNext == '-') { + state = SCE_PL_ARRAY; + } else if (chNext == ':' && chNext2 == ':') { + state = SCE_PL_ARRAY; + i += 2; + ch = styler.SafeGetCharAt(i); + chNext = styler.SafeGetCharAt(i + 1); + } else if (chNext != '{' && chNext != '[') { + styler.ColourTo(i, SCE_PL_ARRAY); + } else { + styler.ColourTo(i, SCE_PL_ARRAY); + } + backflag = BACK_NONE; + } else if (ch == '%') { + if (!isascii(chNext) || isalpha(chNext) || chNext == '#' || chNext == '$' + || chNext == '_' || chNext == '!' || chNext == '^') { + state = SCE_PL_HASH; + i++; + ch = chNext; + chNext = chNext2; + } else if (chNext == ':' && chNext2 == ':') { + state = SCE_PL_HASH; + i += 2; + ch = styler.SafeGetCharAt(i); + chNext = styler.SafeGetCharAt(i + 1); + } else if (chNext == '{') { + styler.ColourTo(i, SCE_PL_HASH); + } else { + styler.ColourTo(i, SCE_PL_OPERATOR); + } + backflag = BACK_NONE; + } else if (ch == '*') { + char strch[2]; + strch[0] = chNext; + strch[1] = '\0'; + if (chNext == ':' && chNext2 == ':') { + state = SCE_PL_SYMBOLTABLE; + i += 2; + ch = styler.SafeGetCharAt(i); + chNext = styler.SafeGetCharAt(i + 1); + } else if (!isascii(chNext) || isalpha(chNext) || chNext == '_' + || NULL != strstr("^/|,\\\";#%^:?<>)[]", strch)) { + state = SCE_PL_SYMBOLTABLE; + i++; + ch = chNext; + chNext = chNext2; + } else if (chNext == '{') { + styler.ColourTo(i, SCE_PL_SYMBOLTABLE); + } else { + if (chNext == '*') { // exponentiation + i++; + ch = chNext; + chNext = chNext2; + } + styler.ColourTo(i, SCE_PL_OPERATOR); + } + backflag = BACK_NONE; + } else if (ch == '/' || (ch == '<' && chNext == '<')) { + // Explicit backward peeking to set a consistent preferRE for + // any slash found, so no longer need to track preferRE state. + // Find first previous significant lexed element and interpret. + // Test for HERE doc start '<<' shares this code, helps to + // determine if it should be an operator. + bool preferRE = false; + bool isHereDoc = (ch == '<'); + bool hereDocSpace = false; // these are for corner case: + bool hereDocScalar = false; // SCALAR [whitespace] '<<' + unsigned int bk = (i > 0)? i - 1: 0; + char bkch; + styler.Flush(); + if (styler.StyleAt(bk) == SCE_PL_DEFAULT) + hereDocSpace = true; + while ((bk > 0) && (styler.StyleAt(bk) == SCE_PL_DEFAULT || + styler.StyleAt(bk) == SCE_PL_COMMENTLINE)) { + bk--; + } + if (bk == 0) { + // position 0 won't really be checked; rarely happens + // hard to fix due to an unsigned index i + preferRE = true; + } else { + int bkstyle = styler.StyleAt(bk); + bkch = styler.SafeGetCharAt(bk); + switch(bkstyle) { + case SCE_PL_OPERATOR: + preferRE = true; + if (bkch == ')' || bkch == ']') { + preferRE = false; + } else if (bkch == '}') { + // backtrack further, count balanced brace pairs + // if a brace pair found, see if it's a variable + int braceCount = 1; + while (--bk > 0) { + bkstyle = styler.StyleAt(bk); + if (bkstyle == SCE_PL_OPERATOR) { + bkch = styler.SafeGetCharAt(bk); + if (bkch == ';') { // early out + break; + } else if (bkch == '}') { + braceCount++; + } else if (bkch == '{') { + if (--braceCount == 0) + break; + } + } + } + if (bk == 0) { + // at beginning, true + } else if (braceCount == 0) { + // balanced { found, bk>0, skip more whitespace + if (styler.StyleAt(--bk) == SCE_PL_DEFAULT) { + while (bk > 0) { + bkstyle = styler.StyleAt(--bk); + if (bkstyle != SCE_PL_DEFAULT) + break; + } + } + bkstyle = styler.StyleAt(bk); + if (bkstyle == SCE_PL_SCALAR + || bkstyle == SCE_PL_ARRAY + || bkstyle == SCE_PL_HASH + || bkstyle == SCE_PL_SYMBOLTABLE + || bkstyle == SCE_PL_OPERATOR) { + preferRE = false; + } + } + } + break; + case SCE_PL_IDENTIFIER: + preferRE = true; + if (bkch == '>') { // inputsymbol + preferRE = false; + break; + } + // backtrack to find "->" or "::" before identifier + while (bk > 0 && styler.StyleAt(bk) == SCE_PL_IDENTIFIER) { + bk--; + } + while (bk > 0) { + bkstyle = styler.StyleAt(bk); + if (bkstyle == SCE_PL_DEFAULT || + bkstyle == SCE_PL_COMMENTLINE) { + } else if (bkstyle == SCE_PL_OPERATOR) { + // gcc 3.2.3 bloats if more compact form used + bkch = styler.SafeGetCharAt(bk); + if (bkch == '>') { // "->" + if (styler.SafeGetCharAt(bk - 1) == '-') { + preferRE = false; + break; + } + } else if (bkch == ':') { // "::" + if (styler.SafeGetCharAt(bk - 1) == ':') { + preferRE = false; + break; + } + } + } else {// bare identifier, usually a function call but Perl + // optimizes them as pseudo-constants, then the next + // '/' will be a divide; favour divide over regex + // if there is a whitespace after the '/' + if (isspacechar(chNext)) { + preferRE = false; + } + break; + } + bk--; + } + break; + case SCE_PL_SCALAR: // for $var<< case + hereDocScalar = true; + break; + // other styles uses the default, preferRE=false + case SCE_PL_WORD: + case SCE_PL_POD: + case SCE_PL_POD_VERB: + case SCE_PL_HERE_Q: + case SCE_PL_HERE_QQ: + case SCE_PL_HERE_QX: + preferRE = true; + break; + } + } + if (isHereDoc) { // handle HERE doc + // if SCALAR whitespace '<<', *always* a HERE doc + if (preferRE || (hereDocSpace && hereDocScalar)) { + state = SCE_PL_HERE_DELIM; + HereDoc.State = 0; + } else { // << operator + i++; + ch = chNext; + chNext = chNext2; + styler.ColourTo(i, SCE_PL_OPERATOR); + } + } else { // handle regexp + if (preferRE) { + state = SCE_PL_REGEX; + Quote.New(1); + Quote.Open(ch); + } else { // / operator + styler.ColourTo(i, SCE_PL_OPERATOR); + } + } + backflag = BACK_NONE; + } else if (ch == '<') { + // looks forward for matching > on same line + unsigned int fw = i + 1; + while (fw < lengthDoc) { + char fwch = styler.SafeGetCharAt(fw); + if (fwch == ' ') { + if (styler.SafeGetCharAt(fw-1) != '\\' || + styler.SafeGetCharAt(fw-2) != '\\') + break; + } else if (isEOLChar(fwch) || isspacechar(fwch)) { + break; + } else if (fwch == '>') { + if ((fw - i) == 2 && // '<=>' case + styler.SafeGetCharAt(fw-1) == '=') { + styler.ColourTo(fw, SCE_PL_OPERATOR); + } else { + styler.ColourTo(fw, SCE_PL_IDENTIFIER); + } + i = fw; + ch = fwch; + chNext = styler.SafeGetCharAt(i+1); + } + fw++; + } + styler.ColourTo(i, SCE_PL_OPERATOR); + backflag = BACK_NONE; + } else if (ch == '=' // POD + && isalpha(chNext) + && (isEOLChar(chPrev))) { + state = SCE_PL_POD; + backflag = BACK_NONE; + //sookedpos = 0; + //sooked[sookedpos] = '\0'; + } else if (ch == '-' // file test operators + && isSingleCharOp(chNext) + && !isalnum((chNext2 = styler.SafeGetCharAt(i+2)))) { + styler.ColourTo(i + 1, SCE_PL_WORD); + state = SCE_PL_DEFAULT; + i++; + ch = chNext; + chNext = chNext2; + backflag = BACK_NONE; + } else if (isPerlOperator(ch)) { + if (ch == '.' && chNext == '.') { // .. and ... + i++; + if (chNext2 == '.') { i++; } + state = SCE_PL_DEFAULT; + ch = styler.SafeGetCharAt(i); + chNext = styler.SafeGetCharAt(i + 1); + } + styler.ColourTo(i, SCE_PL_OPERATOR); + backflag = BACK_OPERATOR; + backPos = i; + } else { + // keep colouring defaults to make restart easier + styler.ColourTo(i, SCE_PL_DEFAULT); + } + } else if (state == SCE_PL_NUMBER) { + if (ch == '.') { + if (chNext == '.') { + // double dot is always an operator + goto numAtEnd; + } else if (numState <= PERLNUM_FLOAT) { + // non-decimal number or float exponent, consume next dot + styler.ColourTo(i - 1, SCE_PL_NUMBER); + styler.ColourTo(i, SCE_PL_OPERATOR); + state = SCE_PL_DEFAULT; + } else { // decimal or vectors allows dots + dotCount++; + if (numState == PERLNUM_DECIMAL) { + if (dotCount > 1) { + if (isdigit(chNext)) { // really a vector + numState = PERLNUM_VECTOR; + } else // number then dot + goto numAtEnd; + } + } else { // vectors + if (!isdigit(chNext)) // vector then dot + goto numAtEnd; + } + } + } else if (ch == '_' && numState == PERLNUM_DECIMAL) { + if (!isdigit(chNext)) { + goto numAtEnd; + } + } else if (!isascii(ch) || isalnum(ch)) { + if (numState == PERLNUM_VECTOR || numState == PERLNUM_V_VECTOR) { + if (!isascii(ch) || isalpha(ch)) { + if (dotCount == 0) { // change to word + state = SCE_PL_IDENTIFIER; + } else { // vector then word + goto numAtEnd; + } + } + } else if (numState == PERLNUM_DECIMAL) { + if (ch == 'E' || ch == 'e') { // exponent + numState = PERLNUM_FLOAT; + if (chNext == '+' || chNext == '-') { + i++; + ch = chNext; + chNext = chNext2; + } + } else if (!isascii(ch) || !isdigit(ch)) { // number then word + goto numAtEnd; + } + } else if (numState == PERLNUM_FLOAT) { + if (!isdigit(ch)) { // float then word + goto numAtEnd; + } + } else if (numState == PERLNUM_OCTAL) { + if (!isdigit(ch)) + goto numAtEnd; + else if (ch > '7') + numState = PERLNUM_BAD; + } else if (numState == PERLNUM_BINARY) { + if (!isdigit(ch)) + goto numAtEnd; + else if (ch > '1') + numState = PERLNUM_BAD; + } else if (numState == PERLNUM_HEX) { + int ch2 = toupper(ch); + if (!isdigit(ch) && !(ch2 >= 'A' && ch2 <= 'F')) + goto numAtEnd; + } else {//(numState == PERLNUM_BAD) { + if (!isdigit(ch)) + goto numAtEnd; + } + } else { + // complete current number or vector + numAtEnd: + styler.ColourTo(i - 1, actualNumStyle(numState)); + state = SCE_PL_DEFAULT; + goto restartLexer; + } + } else if (state == SCE_PL_IDENTIFIER) { + if (!isWordStart(chNext) && chNext != '\'') { + styler.ColourTo(i, SCE_PL_IDENTIFIER); + state = SCE_PL_DEFAULT; + ch = ' '; + } + } else { + if (state == SCE_PL_COMMENTLINE) { + if (isEOLChar(ch)) { + styler.ColourTo(i - 1, state); + state = SCE_PL_DEFAULT; + goto restartLexer; + } else if (isEOLChar(chNext)) { + styler.ColourTo(i, state); + state = SCE_PL_DEFAULT; + } + } else if (state == SCE_PL_HERE_DELIM) { + // + // From perldata.pod: + // ------------------ + // A line-oriented form of quoting is based on the shell ``here-doc'' + // syntax. + // Following a << you specify a string to terminate the quoted material, + // and all lines following the current line down to the terminating + // string are the value of the item. + // The terminating string may be either an identifier (a word), + // or some quoted text. + // If quoted, the type of quotes you use determines the treatment of + // the text, just as in regular quoting. + // An unquoted identifier works like double quotes. + // There must be no space between the << and the identifier. + // (If you put a space it will be treated as a null identifier, + // which is valid, and matches the first empty line.) + // (This is deprecated, -w warns of this syntax) + // The terminating string must appear by itself (unquoted and with no + // surrounding whitespace) on the terminating line. + // + // From Bash info: + // --------------- + // Specifier format is: <<[-]WORD + // Optional '-' is for removal of leading tabs from here-doc. + // Whitespace acceptable after <<[-] operator. + // + if (HereDoc.State == 0) { // '<<' encountered + bool gotspace = false; + unsigned int oldi = i; + if (chNext == ' ' || chNext == '\t') { + // skip whitespace; legal for quoted delimiters + gotspace = true; + do { + i++; + chNext = styler.SafeGetCharAt(i + 1); + } while ((i + 1 < lengthDoc) && (chNext == ' ' || chNext == '\t')); + chNext2 = styler.SafeGetCharAt(i + 2); + } + HereDoc.State = 1; + HereDoc.Quote = chNext; + HereDoc.Quoted = false; + HereDoc.DelimiterLength = 0; + HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0'; + if (chNext == '\'' || chNext == '"' || chNext == '`') { + // a quoted here-doc delimiter + i++; + ch = chNext; + chNext = chNext2; + HereDoc.Quoted = true; + } else if (isspacechar(chNext) || isdigit(chNext) || chNext == '\\' + || chNext == '=' || chNext == '$' || chNext == '@' + || ((isalpha(chNext) || chNext == '_') && gotspace)) { + // left shift << or <<= operator cases + // restore position if operator + i = oldi; + styler.ColourTo(i, SCE_PL_OPERATOR); + state = SCE_PL_DEFAULT; + HereDoc.State = 0; + goto restartLexer; + } else { + // an unquoted here-doc delimiter, no special handling + // (cannot be prefixed by spaces/tabs), or + // symbols terminates; deprecated zero-length delimiter + } + + } else if (HereDoc.State == 1) { // collect the delimiter + backflag = BACK_NONE; + if (HereDoc.Quoted) { // a quoted here-doc delimiter + if (ch == HereDoc.Quote) { // closing quote => end of delimiter + styler.ColourTo(i, state); + state = SCE_PL_DEFAULT; + } else { + if (ch == '\\' && chNext == HereDoc.Quote) { // escaped quote + i++; + ch = chNext; + chNext = chNext2; + } + HereDoc.Delimiter[HereDoc.DelimiterLength++] = ch; + HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0'; + } + } else { // an unquoted here-doc delimiter + if (isalnum(ch) || ch == '_') { + HereDoc.Delimiter[HereDoc.DelimiterLength++] = ch; + HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0'; + } else { + styler.ColourTo(i - 1, state); + state = SCE_PL_DEFAULT; + goto restartLexer; + } + } + if (HereDoc.DelimiterLength >= HERE_DELIM_MAX - 1) { + styler.ColourTo(i - 1, state); + state = SCE_PL_ERROR; + goto restartLexer; + } + } + } else if (HereDoc.State == 2) { + // state == SCE_PL_HERE_Q || state == SCE_PL_HERE_QQ || state == SCE_PL_HERE_QX + if (isEOLChar(chPrev) && isMatch(styler, lengthDoc, i, HereDoc.Delimiter)) { + i += HereDoc.DelimiterLength; + chPrev = styler.SafeGetCharAt(i - 1); + ch = styler.SafeGetCharAt(i); + if (isEOLChar(ch)) { + styler.ColourTo(i - 1, state); + state = SCE_PL_DEFAULT; + backflag = BACK_NONE; + HereDoc.State = 0; + goto restartLexer; + } + chNext = styler.SafeGetCharAt(i + 1); + } + } else if (state == SCE_PL_POD + || state == SCE_PL_POD_VERB) { + if (isEOLChar(chPrev)) { + if (ch == ' ' || ch == '\t') { + styler.ColourTo(i - 1, state); + state = SCE_PL_POD_VERB; + } else { + styler.ColourTo(i - 1, state); + state = SCE_PL_POD; + if (ch == '=') { + if (isMatch(styler, lengthDoc, i, "=cut")) { + styler.ColourTo(i - 1 + 4, state); + i += 4; + state = SCE_PL_DEFAULT; + ch = styler.SafeGetCharAt(i); + //chNext = styler.SafeGetCharAt(i + 1); + goto restartLexer; + } + } + } + } + } else if (state == SCE_PL_SCALAR // variable names + || state == SCE_PL_ARRAY + || state == SCE_PL_HASH + || state == SCE_PL_SYMBOLTABLE) { + if (ch == ':' && chNext == ':') { // skip :: + i++; + ch = chNext; + chNext = chNext2; + } + else if (isEndVar(ch)) { + if (i == (styler.GetStartSegment() + 1)) { + // Special variable: $(, $_ etc. + styler.ColourTo(i, state); + state = SCE_PL_DEFAULT; + } else { + styler.ColourTo(i - 1, state); + state = SCE_PL_DEFAULT; + goto restartLexer; + } + } + } else if (state == SCE_PL_REGEX + || state == SCE_PL_STRING_QR + ) { + if (!Quote.Up && !isspacechar(ch)) { + Quote.Open(ch); + } else if (ch == '\\' && Quote.Up != '\\') { + // SG: Is it save to skip *every* escaped char? + i++; + ch = chNext; + chNext = styler.SafeGetCharAt(i + 1); + } else { + if (ch == Quote.Down /*&& chPrev != '\\'*/) { + Quote.Count--; + if (Quote.Count == 0) { + Quote.Rep--; + if (Quote.Up == Quote.Down) { + Quote.Count++; + } + } + if (!isalpha(chNext)) { + if (Quote.Rep <= 0) { + styler.ColourTo(i, state); + state = SCE_PL_DEFAULT; + ch = ' '; + } + } + } else if (ch == Quote.Up /*&& chPrev != '\\'*/) { + Quote.Count++; + } else if (!isascii(chNext) || !isalpha(chNext)) { + if (Quote.Rep <= 0) { + styler.ColourTo(i, state); + state = SCE_PL_DEFAULT; + ch = ' '; + } + } + } + } else if (state == SCE_PL_REGSUBST) { + if (!Quote.Up && !isspacechar(ch)) { + Quote.Open(ch); + } else if (ch == '\\' && Quote.Up != '\\') { + // SG: Is it save to skip *every* escaped char? + i++; + ch = chNext; + chNext = styler.SafeGetCharAt(i + 1); + } else { + if (Quote.Count == 0 && Quote.Rep == 1) { + /* We matched something like s(...) or tr{...} + * and are looking for the next matcher characters, + * which could be either bracketed ({...}) or non-bracketed + * (/.../). + * + * Number-signs are problematic. If they occur after + * the close of the first part, treat them like + * a Quote.Up char, even if they actually start comments. + * + * If we find an alnum, we end the regsubst, and punt. + * + * Eric Promislow ericp@activestate.com Aug 9,2000 + */ + if (isspacechar(ch)) { + // Keep going + } + else if (!isascii(ch) || isalnum(ch)) { + styler.ColourTo(i, state); + state = SCE_PL_DEFAULT; + ch = ' '; + } else { + Quote.Open(ch); + } + } else if (ch == Quote.Down /*&& chPrev != '\\'*/) { + Quote.Count--; + if (Quote.Count == 0) { + Quote.Rep--; + } + if (!isascii(chNext) || !isalpha(chNext)) { + if (Quote.Rep <= 0) { + styler.ColourTo(i, state); + state = SCE_PL_DEFAULT; + ch = ' '; + } + } + if (Quote.Up == Quote.Down) { + Quote.Count++; + } + } else if (ch == Quote.Up /*&& chPrev != '\\'*/) { + Quote.Count++; + } else if (!isascii(chNext) || !isalpha(chNext)) { + if (Quote.Rep <= 0) { + styler.ColourTo(i, state); + state = SCE_PL_DEFAULT; + ch = ' '; + } + } + } + } else if (state == SCE_PL_STRING_Q + || state == SCE_PL_STRING_QQ + || state == SCE_PL_STRING_QX + || state == SCE_PL_STRING_QW + || state == SCE_PL_STRING + || state == SCE_PL_CHARACTER + || state == SCE_PL_BACKTICKS + ) { + if (!Quote.Down && !isspacechar(ch)) { + Quote.Open(ch); + } else if (ch == '\\' && Quote.Up != '\\') { + i++; + ch = chNext; + chNext = styler.SafeGetCharAt(i + 1); + } else if (ch == Quote.Down) { + Quote.Count--; + if (Quote.Count == 0) { + Quote.Rep--; + if (Quote.Rep <= 0) { + styler.ColourTo(i, state); + state = SCE_PL_DEFAULT; + ch = ' '; + } + if (Quote.Up == Quote.Down) { + Quote.Count++; + } + } + } else if (ch == Quote.Up) { + Quote.Count++; + } + } + } + if (state == SCE_PL_ERROR) { + break; + } + chPrev = ch; + } + styler.ColourTo(lengthDoc - 1, state); +} + +static bool IsCommentLine(int line, Accessor &styler) { + int pos = styler.LineStart(line); + int eol_pos = styler.LineStart(line + 1) - 1; + for (int i = pos; i < eol_pos; i++) { + char ch = styler[i]; + int style = styler.StyleAt(i); + if (ch == '#' && style == SCE_PL_COMMENTLINE) + return true; + else if (ch != ' ' && ch != '\t') + return false; + } + return false; +} + +static void FoldPerlDoc(unsigned int startPos, int length, int, WordList *[], + Accessor &styler) { + bool foldComment = styler.GetPropertyInt("fold.comment") != 0; + bool foldCompact = styler.GetPropertyInt("fold.compact", 1) != 0; + // Custom folding of POD and packages + bool foldPOD = styler.GetPropertyInt("fold.perl.pod", 1) != 0; + bool foldPackage = styler.GetPropertyInt("fold.perl.package", 1) != 0; + unsigned int endPos = startPos + length; + int visibleChars = 0; + int lineCurrent = styler.GetLine(startPos); + int levelPrev = SC_FOLDLEVELBASE; + if (lineCurrent > 0) + levelPrev = styler.LevelAt(lineCurrent - 1) >> 16; + int levelCurrent = levelPrev; + char chNext = styler[startPos]; + char chPrev = styler.SafeGetCharAt(startPos - 1); + int styleNext = styler.StyleAt(startPos); + // Used at end of line to determine if the line was a package definition + bool isPackageLine = false; + bool isPodHeading = false; + for (unsigned int i = startPos; i < endPos; i++) { + char ch = chNext; + chNext = styler.SafeGetCharAt(i + 1); + int style = styleNext; + styleNext = styler.StyleAt(i + 1); + bool atEOL = (ch == '\r' && chNext != '\n') || (ch == '\n'); + bool atLineStart = isEOLChar(chPrev) || i == 0; + // Comment folding + if (foldComment && atEOL && IsCommentLine(lineCurrent, styler)) + { + if (!IsCommentLine(lineCurrent - 1, styler) + && IsCommentLine(lineCurrent + 1, styler)) + levelCurrent++; + else if (IsCommentLine(lineCurrent - 1, styler) + && !IsCommentLine(lineCurrent+1, styler)) + levelCurrent--; + } + if (style == SCE_C_OPERATOR) { + if (ch == '{') { + levelCurrent++; + } else if (ch == '}') { + levelCurrent--; + } + } + // Custom POD folding + if (foldPOD && atLineStart) { + int stylePrevCh = (i) ? styler.StyleAt(i - 1):SCE_PL_DEFAULT; + if (style == SCE_PL_POD) { + if (stylePrevCh != SCE_PL_POD && stylePrevCh != SCE_PL_POD_VERB) + levelCurrent++; + else if (styler.Match(i, "=cut")) + levelCurrent--; + else if (styler.Match(i, "=head")) + isPodHeading = true; + } else if (style == SCE_PL_DATASECTION) { + if (ch == '=' && isalpha(chNext) && levelCurrent == SC_FOLDLEVELBASE) + levelCurrent++; + else if (styler.Match(i, "=cut") && levelCurrent > SC_FOLDLEVELBASE) + levelCurrent--; + else if (styler.Match(i, "=head")) + isPodHeading = true; + // if package used or unclosed brace, level > SC_FOLDLEVELBASE! + // reset needed as level test is vs. SC_FOLDLEVELBASE + else if (styler.Match(i, "__END__")) + levelCurrent = SC_FOLDLEVELBASE; + } + } + // Custom package folding + if (foldPackage && atLineStart) { + if (style == SCE_PL_WORD && styler.Match(i, "package")) { + isPackageLine = true; + } + } + + if (atEOL) { + int lev = levelPrev; + if (isPodHeading) { + lev = levelPrev - 1; + lev |= SC_FOLDLEVELHEADERFLAG; + isPodHeading = false; + } + // Check if line was a package declaration + // because packages need "special" treatment + if (isPackageLine) { + lev = SC_FOLDLEVELBASE | SC_FOLDLEVELHEADERFLAG; + levelCurrent = SC_FOLDLEVELBASE + 1; + isPackageLine = false; + } + lev |= levelCurrent << 16; + if (visibleChars == 0 && foldCompact) + lev |= SC_FOLDLEVELWHITEFLAG; + if ((levelCurrent > levelPrev) && (visibleChars > 0)) + lev |= SC_FOLDLEVELHEADERFLAG; + if (lev != styler.LevelAt(lineCurrent)) { + styler.SetLevel(lineCurrent, lev); + } + lineCurrent++; + levelPrev = levelCurrent; + visibleChars = 0; + } + if (!isspacechar(ch)) + visibleChars++; + chPrev = ch; + } + // Fill in the real level of the next line, keeping the current flags as they will be filled in later + int flagsNext = styler.LevelAt(lineCurrent) & ~SC_FOLDLEVELNUMBERMASK; + styler.SetLevel(lineCurrent, levelPrev | flagsNext); +} + +static const char * const perlWordListDesc[] = { + "Keywords", + 0 +}; + +LexerModule lmPerl(SCLEX_PERL, ColourisePerlDoc, "perl", FoldPerlDoc, perlWordListDesc); + |