diff options
Diffstat (limited to 'docs/tool/Modules/NaturalDocs/Languages/Perl.pm')
| -rw-r--r-- | docs/tool/Modules/NaturalDocs/Languages/Perl.pm | 1370 |
1 files changed, 1370 insertions, 0 deletions
diff --git a/docs/tool/Modules/NaturalDocs/Languages/Perl.pm b/docs/tool/Modules/NaturalDocs/Languages/Perl.pm new file mode 100644 index 00000000..8817aadc --- /dev/null +++ b/docs/tool/Modules/NaturalDocs/Languages/Perl.pm @@ -0,0 +1,1370 @@ +############################################################################### +# +# Class: NaturalDocs::Languages::Perl +# +############################################################################### +# +# A subclass to handle the language variations of Perl. +# +# +# Topic: Language Support +# +# Supported: +# +# - Packages +# - Inheritance via "use base" and "@ISA =". +# - Functions +# - Variables +# +# Not supported yet: +# +# - Constants +# +############################################################################### + +# This file is part of Natural Docs, which is Copyright (C) 2003-2008 Greg Valure +# Natural Docs is licensed under the GPL + +use strict; +use integer; + +package NaturalDocs::Languages::Perl; + +use base 'NaturalDocs::Languages::Advanced'; + + +# +# array: hereDocTerminators +# An array of active Here Doc terminators, or an empty array if not active. Each entry is an arrayref of tokens. The entries +# must appear in the order they must appear in the source. +# +my @hereDocTerminators; + + + +############################################################################### +# Group: Interface Functions + + +# +# Function: PackageSeparator +# Returns the package separator symbol. +# +sub PackageSeparator + { return '::'; }; + +# +# Function: EnumValues +# Returns the <EnumValuesType> that describes how the language handles enums. +# +sub EnumValues + { return ::ENUM_GLOBAL(); }; + + +# +# Function: ParseFile +# +# Parses the passed source file, sending comments acceptable for documentation to <NaturalDocs::Parser->OnComment()>. +# +# Parameters: +# +# sourceFile - The name of the source file to parse. +# topicList - A reference to the list of <NaturalDocs::Parser::ParsedTopics> being built by the file. +# +# Returns: +# +# The array ( autoTopics, scopeRecord ). +# +# autoTopics - An arrayref of automatically generated topics from the file, or undef if none. +# scopeRecord - An arrayref of <NaturalDocs::Languages::Advanced::ScopeChanges>, or undef if none. +# +sub ParseFile #(sourceFile, topicsList) + { + my ($self, $sourceFile, $topicsList) = @_; + + @hereDocTerminators = ( ); + + # The regular block comment symbols are undef because they're all potentially JavaDoc comments. PreprocessFile() will + # handle translating things like =begin naturaldocs and =begin javadoc to =begin nd. + $self->ParseForCommentsAndTokens($sourceFile, [ '#' ], undef, [ '##' ], [ '=begin nd', '=end nd' ]); + + my $tokens = $self->Tokens(); + my $index = 0; + my $lineNumber = 1; + + while ($index < scalar @$tokens) + { + if ($self->TryToSkipWhitespace(\$index, \$lineNumber) || + $self->TryToGetPackage(\$index, \$lineNumber) || + $self->TryToGetBase(\$index, \$lineNumber) || + $self->TryToGetFunction(\$index, \$lineNumber) || + $self->TryToGetVariable(\$index, \$lineNumber) ) + { + # The functions above will handle everything. + } + + elsif ($tokens->[$index] eq '{') + { + $self->StartScope('}', $lineNumber, undef); + $index++; + } + + elsif ($tokens->[$index] eq '}') + { + if ($self->ClosingScopeSymbol() eq '}') + { $self->EndScope($lineNumber); }; + + $index++; + } + + elsif (lc($tokens->[$index]) eq 'eval') + { + # We want to skip the token in this case instead of letting it fall to SkipRestOfStatement. This allows evals with braces + # to be treated like normal floating braces. + $index++; + } + + else + { + $self->SkipRestOfStatement(\$index, \$lineNumber); + }; + }; + + + # Don't need to keep these around. + $self->ClearTokens(); + + return ( $self->AutoTopics(), $self->ScopeRecord() ); + }; + + +# +# Function: PreprocessFile +# +# Overridden to support "=begin nd" and similar. +# +# - "=begin [nd|naturaldocs|natural docs|jd|javadoc|java doc]" all translate to "=begin nd". +# - "=[nd|naturaldocs|natural docs]" also translate to "=begin nd". +# - "=end [nd|naturaldocs|natural docs|jd|javadoc]" all translate to "=end nd". +# - "=cut" from a ND block translates into "=end nd", but the next line will be altered to begin with "(NDPODBREAK)". This is +# so if there is POD leading into ND which ends with a cut, the parser can still end the original POD because the end ND line +# would have been removed. Remember, <NaturalDocs::Languages::Advanced->ParseForCommentsAndTokens()> removes +# Natural Docs-worthy comments to save parsing time. +# - "=pod begin nd" and "=pod end nd" are supported for compatibility with ND 1.32 and earlier, even though the syntax is a +# mistake. +# - It also supports the wrong plural forms, so naturaldoc/natural doc/javadocs/java docs will work. +# +sub PreprocessFile #(lines) + { + my ($self, $lines) = @_; + + my $inNDPOD = 0; + my $mustBreakPOD = 0; + + for (my $i = 0; $i < scalar @$lines; $i++) + { + if ($lines->[$i] =~ /^\=(?:(?:pod[ \t]+)?begin[ \t]+)?(?:nd|natural[ \t]*docs?|jd|java[ \t]*docs?)[ \t]*$/i) + { + $lines->[$i] = '=begin nd'; + $inNDPOD = 1; + $mustBreakPOD = 0; + } + elsif ($lines->[$i] =~ /^\=(?:pod[ \t]+)end[ \t]+(?:nd|natural[ \t]*docs?|jd|javadocs?)[ \t]*$/i) + { + $lines->[$i] = '=end nd'; + $inNDPOD = 0; + $mustBreakPOD = 0; + } + elsif ($lines->[$i] =~ /^\=cut[ \t]*$/i) + { + if ($inNDPOD) + { + $lines->[$i] = '=end nd'; + $inNDPOD = 0; + $mustBreakPOD = 1; + }; + } + elsif ($mustBreakPOD) + { + $lines->[$i] = '(NDPODBREAK)' . $lines->[$i]; + $mustBreakPOD = 0; + }; + }; + }; + + + +############################################################################### +# Group: Statement Parsing Functions +# All functions here assume that the current position is at the beginning of a statement. +# +# Note for developers: I am well aware that the code in these functions do not check if we're past the end of the tokens as +# often as it should. We're making use of the fact that Perl will always return undef in these cases to keep the code simpler. + + +# +# Function: TryToGetPackage +# +# Determines whether the position is at a package declaration statement, and if so, generates a topic for it, skips it, and +# returns true. +# +sub TryToGetPackage #(indexRef, lineNumberRef) + { + my ($self, $indexRef, $lineNumberRef) = @_; + my $tokens = $self->Tokens(); + + if (lc($tokens->[$$indexRef]) eq 'package') + { + my $index = $$indexRef + 1; + my $lineNumber = $$lineNumberRef; + + if (!$self->TryToSkipWhitespace(\$index, \$lineNumber)) + { return undef; }; + + my $name; + + while ($tokens->[$index] =~ /^[a-z_\:]/i) + { + $name .= $tokens->[$index]; + $index++; + }; + + if (!defined $name) + { return undef; }; + + my $autoTopic = NaturalDocs::Parser::ParsedTopic->New(::TOPIC_CLASS(), $name, + undef, undef, + undef, + undef, undef, $$lineNumberRef); + $self->AddAutoTopic($autoTopic); + + NaturalDocs::Parser->OnClass($autoTopic->Symbol()); + + $self->SetPackage($autoTopic->Symbol(), $$lineNumberRef); + + $$indexRef = $index; + $$lineNumberRef = $lineNumber; + $self->SkipRestOfStatement($indexRef, $lineNumberRef); + + return 1; + }; + + return undef; + }; + + +# +# Function: TryToGetBase +# +# Determines whether the position is at a package base declaration statement, and if so, calls +# <NaturalDocs::Parser->OnClassParent()>. +# +# Supported Syntaxes: +# +# > use base [list of strings] +# > @ISA = [list of strings] +# > @[package]::ISA = [list of strings] +# > our @ISA = [list of strings] +# +sub TryToGetBase #(indexRef, lineNumberRef) + { + my ($self, $indexRef, $lineNumberRef) = @_; + my $tokens = $self->Tokens(); + + my ($index, $lineNumber, $class, $parents); + + if (lc($tokens->[$$indexRef]) eq 'use') + { + $index = $$indexRef + 1; + $lineNumber = $$lineNumberRef; + + if (!$self->TryToSkipWhitespace(\$index, \$lineNumber) || + lc($tokens->[$index]) ne 'base') + { return undef; } + + $index++; + $self->TryToSkipWhitespace(\$index, \$lineNumber); + + $parents = $self->TryToGetListOfStrings(\$index, \$lineNumber); + } + + else + { + $index = $$indexRef; + $lineNumber = $$lineNumberRef; + + if (lc($tokens->[$index]) eq 'our') + { + $index++; + $self->TryToSkipWhitespace(\$index, \$lineNumber); + }; + + if ($tokens->[$index] eq '@') + { + $index++; + + while ($index < scalar @$tokens) + { + if ($tokens->[$index] eq 'ISA') + { + $index++; + $self->TryToSkipWhitespace(\$index, \$lineNumber); + + if ($tokens->[$index] eq '=') + { + $index++; + $self->TryToSkipWhitespace(\$index, \$lineNumber); + + $parents = $self->TryToGetListOfStrings(\$index, \$lineNumber); + } + else + { last; }; + } + + # If token isn't ISA... + elsif ($tokens->[$index] =~ /^[a-z0-9_:]/i) + { + $class .= $tokens->[$index]; + $index++; + } + else + { last; }; + }; + }; + }; + + if (defined $parents) + { + if (defined $class) + { + $class =~ s/::$//; + my @classIdentifiers = split(/::/, $class); + $class = NaturalDocs::SymbolString->Join(@classIdentifiers); + } + else + { $class = $self->CurrentScope(); }; + + foreach my $parent (@$parents) + { + my @parentIdentifiers = split(/::/, $parent); + my $parentSymbol = NaturalDocs::SymbolString->Join(@parentIdentifiers); + + NaturalDocs::Parser->OnClassParent($class, $parentSymbol, undef, undef, ::RESOLVE_ABSOLUTE()); + }; + + $$indexRef = $index; + $$lineNumberRef = $lineNumber; + $self->SkipRestOfStatement($indexRef, $lineNumberRef); + + return 1; + } + else + { return undef; }; + }; + + +# +# Function: TryToGetFunction +# +# Determines whether the position is at a function declaration statement, and if so, generates a topic for it, skips it, and +# returns true. +# +sub TryToGetFunction #(indexRef, lineNumberRef) + { + my ($self, $indexRef, $lineNumberRef) = @_; + my $tokens = $self->Tokens(); + + if ( lc($tokens->[$$indexRef]) eq 'sub') + { + my $prototypeStart = $$indexRef; + my $prototypeStartLine = $$lineNumberRef; + my $prototypeEnd = $$indexRef + 1; + my $prototypeEndLine = $$lineNumberRef; + + if ( !$self->TryToSkipWhitespace(\$prototypeEnd, \$prototypeEndLine) || + $tokens->[$prototypeEnd] !~ /^[a-z_]/i ) + { return undef; }; + + my $name = $tokens->[$prototypeEnd]; + $prototypeEnd++; + + # We parsed 'sub [name]'. Now keep going until we find a semicolon or a brace. + + for (;;) + { + if ($prototypeEnd >= scalar @$tokens) + { return undef; } + + # End if we find a semicolon, since it means we found a predeclaration rather than an actual function. + elsif ($tokens->[$prototypeEnd] eq ';') + { return undef; } + + elsif ($tokens->[$prototypeEnd] eq '{') + { + # Found it! + + my $prototype = $self->NormalizePrototype( $self->CreateString($prototypeStart, $prototypeEnd) ); + + $self->AddAutoTopic(NaturalDocs::Parser::ParsedTopic->New(::TOPIC_FUNCTION(), $name, + $self->CurrentScope(), undef, + $prototype, + undef, undef, $prototypeStartLine)); + + $$indexRef = $prototypeEnd; + $$lineNumberRef = $prototypeEndLine; + + $self->SkipRestOfStatement($indexRef, $lineNumberRef); + + return 1; + } + + else + { $self->GenericSkip(\$prototypeEnd, \$prototypeEndLine, 0, 1); }; + }; + } + else + { return undef; }; + }; + + +# +# Function: TryToGetVariable +# +# Determines if the position is at a variable declaration statement, and if so, generates a topic for it, skips it, and returns +# true. +# +# Supported Syntaxes: +# +# - Supports variables declared with "my", "our", and "local". +# - Supports multiple declarations in one statement, such as "my ($x, $y);". +# - Supports types and attributes. +# +sub TryToGetVariable #(indexRef, lineNumberRef) + { + my ($self, $indexRef, $lineNumberRef) = @_; + my $tokens = $self->Tokens(); + + my $firstToken = lc( $tokens->[$$indexRef] ); + + if ($firstToken eq 'my' || $firstToken eq 'our' || $firstToken eq 'local') + { + my $prototypeStart = $$indexRef; + my $prototypeStartLine = $$lineNumberRef; + my $prototypeEnd = $$indexRef + 1; + my $prototypeEndLine = $$lineNumberRef; + + $self->TryToSkipWhitespace(\$prototypeEnd, \$prototypeEndLine); + + + # Get the type if present. + + my $type; + + if ($tokens->[$prototypeEnd] =~ /^[a-z\:]/i) + { + do + { + $type .= $tokens->[$prototypeEnd]; + $prototypeEnd++; + } + while ($tokens->[$prototypeEnd] =~ /^[a-z\:]/i); + + if (!$self->TryToSkipWhitespace(\$prototypeEnd, \$prototypeEndLine)) + { return undef; }; + }; + + + # Get the name, or possibly names. + + if ($tokens->[$prototypeEnd] eq '(') + { + # If there's multiple variables, we'll need to build a custom prototype for each one. $firstToken already has the + # declaring word. We're going to store each name in @names, and we're going to use $prototypeStart and + # $prototypeEnd to capture any properties appearing after the list. + + my $name; + my @names; + my $hasComma = 0; + + $prototypeStart = $prototypeEnd + 1; + $prototypeStartLine = $prototypeEndLine; + + for (;;) + { + $self->TryToSkipWhitespace(\$prototypeStart, \$prototypeStartLine); + + $name = $self->TryToGetVariableName(\$prototypeStart, \$prototypeStartLine); + + if (!defined $name) + { return undef; }; + + push @names, $name; + + $self->TryToSkipWhitespace(\$prototypeStart, \$prototypeStartLine); + + # We can have multiple commas in a row. We can also have trailing commas. However, the parenthesis must + # not start with a comma or be empty, hence this logic does not appear earlier. + while ($tokens->[$prototypeStart] eq ',') + { + $prototypeStart++; + $self->TryToSkipWhitespace(\$prototypeStart, \$prototypeStartLine); + + $hasComma = 1; + } + + if ($tokens->[$prototypeStart] eq ')') + { + $prototypeStart++; + last; + } + elsif (!$hasComma) + { return undef; }; + }; + + + # Now find the end of the prototype. + + $prototypeEnd = $prototypeStart; + $prototypeEndLine = $prototypeStartLine; + + while ($prototypeEnd < scalar @$tokens && + $tokens->[$prototypeEnd] !~ /^[\;\=]/) + { + $prototypeEnd++; + }; + + + my $prototypePrefix = $firstToken . ' '; + if (defined $type) + { $prototypePrefix .= $type . ' '; }; + + my $prototypeSuffix = ' ' . $self->CreateString($prototypeStart, $prototypeEnd); + + foreach $name (@names) + { + my $prototype = $self->NormalizePrototype( $prototypePrefix . $name . $prototypeSuffix ); + + $self->AddAutoTopic(NaturalDocs::Parser::ParsedTopic->New(::TOPIC_VARIABLE(), $name, + $self->CurrentScope(), undef, + $prototype, + undef, undef, $prototypeStartLine)); + }; + + $self->SkipRestOfStatement(\$prototypeEnd, \$prototypeEndLine); + + $$indexRef = $prototypeEnd; + $$lineNumberRef = $prototypeEndLine; + } + + else # no parenthesis + { + my $name = $self->TryToGetVariableName(\$prototypeEnd, \$prototypeEndLine); + + if (!defined $name) + { return undef; }; + + while ($prototypeEnd < scalar @$tokens && + $tokens->[$prototypeEnd] !~ /^[\;\=]/) + { + $prototypeEnd++; + }; + + my $prototype = $self->NormalizePrototype( $self->CreateString($prototypeStart, $prototypeEnd) ); + + $self->AddAutoTopic(NaturalDocs::Parser::ParsedTopic->New(::TOPIC_VARIABLE(), $name, + $self->CurrentScope(), undef, + $prototype, + undef, undef, $prototypeStartLine)); + + $self->SkipRestOfStatement(\$prototypeEnd, \$prototypeEndLine); + + $$indexRef = $prototypeEnd; + $$lineNumberRef = $prototypeEndLine; + }; + + return 1; + } + else + { return undef; }; + }; + + +# +# Function: TryToGetVariableName +# +# Determines if the position is at a variable name, and if so, skips it and returns the name. +# +sub TryToGetVariableName #(indexRef, lineNumberRef) + { + my ($self, $indexRef, $lineNumberRef) = @_; + my $tokens = $self->Tokens(); + + my $name; + + if ($tokens->[$$indexRef] =~ /^[\$\@\%\*]/) + { + $name .= $tokens->[$$indexRef]; + $$indexRef++; + + $self->TryToSkipWhitespace($indexRef, $lineNumberRef); + + if ($tokens->[$$indexRef] =~ /^[a-z_]/i) + { + $name .= $tokens->[$$indexRef]; + $$indexRef++; + } + else + { return undef; }; + }; + + return $name; + }; + + +# +# Function: TryToGetListOfStrings +# +# Attempts to retrieve a list of strings from the current position. Returns an arrayref of them if any are found, or undef if none. +# It stops the moment it reaches a non-string, so "string1, variable, string2" will only return string1. +# +# Supported Syntaxes: +# +# - Supports parenthesis. +# - Supports all string forms supported by <TryToSkipString()>. +# - Supports qw() string arrays. +# +sub TryToGetListOfStrings #(indexRef, lineNumberRef) + { + my ($self, $indexRef, $lineNumberRef) = @_; + my $tokens = $self->Tokens(); + + my $parenthesis = 0; + my $strings; + + while ($$indexRef < scalar @$tokens) + { + # We'll tolerate parenthesis. + if ($tokens->[$$indexRef] eq '(') + { + $$indexRef++; + $parenthesis++; + } + elsif ($tokens->[$$indexRef] eq ')') + { + if ($parenthesis == 0) + { last; }; + + $$indexRef++; + $parenthesis--; + } + elsif ($tokens->[$$indexRef] eq ',') + { + $$indexRef++; + } + else + { + my ($startContent, $endContent); + my $symbolIndex = $$indexRef; + + if ($self->TryToSkipString($indexRef, $lineNumberRef, \$startContent, \$endContent)) + { + my $content = $self->CreateString($startContent, $endContent); + + if (!defined $strings) + { $strings = [ ]; }; + + if (lc($tokens->[$symbolIndex]) eq 'qw') + { + $content =~ tr/ \t\n/ /s; + $content =~ s/^ //; + + my @qwStrings = split(/ /, $content); + + push @$strings, @qwStrings; + } + else + { + push @$strings, $content; + }; + } + else + { last; }; + }; + + $self->TryToSkipWhitespace($indexRef, $lineNumberRef); + }; + + return $strings; + }; + + +############################################################################### +# Group: Low Level Parsing Functions + + +# +# Function: GenericSkip +# +# Advances the position one place through general code. +# +# - If the position is on a comment or string, it will skip it completely. +# - If the position is on an opening symbol, it will skip until the past the closing symbol. +# - If the position is on a regexp or quote-like operator, it will skip it completely. +# - If the position is on a backslash, it will skip it and the following token. +# - If the position is on whitespace (including comments), it will skip it completely. +# - Otherwise it skips one token. +# +# Parameters: +# +# indexRef - A reference to the current index. +# lineNumberRef - A reference to the current line number. +# noRegExps - If set, does not test for regular expressions. +# +sub GenericSkip #(indexRef, lineNumberRef, noRegExps) + { + my ($self, $indexRef, $lineNumberRef, $noRegExps, $allowStringedClosingParens) = @_; + my $tokens = $self->Tokens(); + + if ($tokens->[$$indexRef] eq "\\" && $$indexRef + 1 < scalar @$tokens && $tokens->[$$indexRef+1] ne "\n") + { $$indexRef += 2; } + + # Note that we don't want to count backslashed ()[]{} since they could be in regexps. Also, ()[] are valid variable names + # when preceded by a string. + + # We can ignore the scope stack because we're just skipping everything without parsing, and we need recursion anyway. + elsif ($tokens->[$$indexRef] eq '{' && !$self->IsBackslashed($$indexRef)) + { + $$indexRef++; + $self->GenericSkipUntilAfter($indexRef, $lineNumberRef, '}', $noRegExps, $allowStringedClosingParens); + } + elsif ($tokens->[$$indexRef] eq '(' && !$self->IsBackslashed($$indexRef) && !$self->IsStringed($$indexRef)) + { + # Temporarily allow stringed closing parenthesis if it looks like we're in an anonymous function declaration with Perl's + # cheap version of prototypes, such as "my $_declare = sub($) {}". + my $tempAllowStringedClosingParens = $allowStringedClosingParens; + if (!$allowStringedClosingParens) + { + my $tempIndex = $$indexRef - 1; + if ($tempIndex >= 0 && $tokens->[$tempIndex] =~ /^[ \t]/) + { $tempIndex--; } + if ($tempIndex >= 0 && $tokens->[$tempIndex] eq 'sub') + { $tempAllowStringedClosingParens = 1; } + } + + $$indexRef++; + + do + { $self->GenericSkipUntilAfter($indexRef, $lineNumberRef, ')', $noRegExps, $tempAllowStringedClosingParens); } + while ($$indexRef < scalar @$tokens && $self->IsStringed($$indexRef - 1) && !$tempAllowStringedClosingParens); + } + elsif ($tokens->[$$indexRef] eq '[' && !$self->IsBackslashed($$indexRef) && !$self->IsStringed($$indexRef)) + { + $$indexRef++; + + do + { $self->GenericSkipUntilAfter($indexRef, $lineNumberRef, ']', $noRegExps, $allowStringedClosingParens); } + while ($$indexRef < scalar @$tokens && $self->IsStringed($$indexRef - 1)); + } + + elsif ($self->TryToSkipWhitespace($indexRef, $lineNumberRef) || + $self->TryToSkipString($indexRef, $lineNumberRef) || + $self->TryToSkipHereDocDeclaration($indexRef, $lineNumberRef) || + (!$noRegExps && $self->TryToSkipRegexp($indexRef, $lineNumberRef) ) ) + { + } + + else + { $$indexRef++; }; + }; + + +# +# Function: GenericSkipUntilAfter +# +# Advances the position via <GenericSkip()> until a specific token is reached and passed. +# +sub GenericSkipUntilAfter #(indexRef, lineNumberRef, token, noRegExps, allowStringedClosingParens) + { + my ($self, $indexRef, $lineNumberRef, $token, $noRegExps, $allowStringedClosingParens) = @_; + my $tokens = $self->Tokens(); + + while ($$indexRef < scalar @$tokens && $tokens->[$$indexRef] ne $token) + { $self->GenericSkip($indexRef, $lineNumberRef, $noRegExps, $allowStringedClosingParens); }; + + if ($tokens->[$$indexRef] eq "\n") + { $$lineNumberRef++; }; + $$indexRef++; + }; + + +# +# Function: GenericRegexpSkip +# +# Advances the position one place through regexp code. +# +# - If the position is on an opening symbol, it will skip until the past the closing symbol. +# - If the position is on a backslash, it will skip it and the following token. +# - If the position is on whitespace (not including comments), it will skip it completely. +# - Otherwise it skips one token. +# +# Also differs from <GenericSkip()> in that the parenthesis in $( and $) do count against the scope, where they wouldn't +# normally. +# +# Parameters: +# +# indexRef - A reference to the current index. +# lineNumberRef - A reference to the current line number. +# inBrackets - Whether we're in brackets or not. If true, we don't care about matching braces and parenthesis. +# +sub GenericRegexpSkip #(indexRef, lineNumberRef, inBrackets) + { + my ($self, $indexRef, $lineNumberRef, $inBrackets) = @_; + my $tokens = $self->Tokens(); + + if ($tokens->[$$indexRef] eq "\\" && $$indexRef + 1 < scalar @$tokens && $tokens->[$$indexRef+1] ne "\n") + { $$indexRef += 2; } + + # We can ignore the scope stack because we're just skipping everything without parsing, and we need recursion anyway. + elsif ($tokens->[$$indexRef] eq '{' && !$self->IsBackslashed($$indexRef) && !$inBrackets) + { + $$indexRef++; + $self->GenericRegexpSkipUntilAfter($indexRef, $lineNumberRef, '}'); + } + elsif ($tokens->[$$indexRef] eq '(' && !$self->IsBackslashed($$indexRef) && !$inBrackets) + { + $$indexRef++; + $self->GenericRegexpSkipUntilAfter($indexRef, $lineNumberRef, ')'); + } + elsif ($tokens->[$$indexRef] eq '[' && !$self->IsBackslashed($$indexRef) && !$self->IsStringed($$indexRef)) + { + $$indexRef++; + + do + { $self->GenericRegexpSkipUntilAfter($indexRef, $lineNumberRef, ']'); } + while ($$indexRef < scalar @$tokens && $self->IsStringed($$indexRef - 1)); + } + + elsif ($tokens->[$$indexRef] eq "\n") + { + $$lineNumberRef++; + $$indexRef++; + } + + else + { $$indexRef++; }; + }; + + +# +# Function: GenericRegexpSkipUntilAfter +# +# Advances the position via <GenericRegexpSkip()> until a specific token is reached and passed. +# +sub GenericRegexpSkipUntilAfter #(indexRef, lineNumberRef, token) + { + my ($self, $indexRef, $lineNumberRef, $token) = @_; + my $tokens = $self->Tokens(); + + my $inBrackets = ( $token eq ']' ); + + while ($$indexRef < scalar @$tokens && $tokens->[$$indexRef] ne $token) + { $self->GenericRegexpSkip($indexRef, $lineNumberRef, $inBrackets); }; + + if ($tokens->[$$indexRef] eq "\n") + { $$lineNumberRef++; }; + $$indexRef++; + }; + + +# +# Function: SkipRestOfStatement +# +# Advances the position via <GenericSkip()> until after the end of the current statement, which is defined as a semicolon or +# a brace group. Of course, either of those appearing inside parenthesis, a nested brace group, etc. don't count. +# +sub SkipRestOfStatement #(indexRef, lineNumberRef) + { + my ($self, $indexRef, $lineNumberRef) = @_; + my $tokens = $self->Tokens(); + + while ($$indexRef < scalar @$tokens && + $tokens->[$$indexRef] ne ';' && + !($tokens->[$$indexRef] eq '{' && !$self->IsStringed($$indexRef)) ) + { + $self->GenericSkip($indexRef, $lineNumberRef); + }; + + if ($tokens->[$$indexRef] eq ';') + { $$indexRef++; } + elsif ($tokens->[$$indexRef] eq '{') + { $self->GenericSkip($indexRef, $lineNumberRef); }; + }; + + +# +# Function: TryToSkipWhitespace +# +# If the current position is on whitespace it skips them and returns true. If there are a number of these in a row, it skips them +# all. +# +# Supported Syntax: +# +# - Whitespace +# - Line break +# - All comment forms supported by <TryToSkipComment()> +# - Here Doc content +# +sub TryToSkipWhitespace #(indexRef, lineNumberRef) + { + my ($self, $indexRef, $lineNumberRef) = @_; + my $tokens = $self->Tokens(); + + my $result; + + while ($$indexRef < scalar @$tokens) + { + if ($self->TryToSkipHereDocContent($indexRef, $lineNumberRef) || + $self->TryToSkipComment($indexRef, $lineNumberRef)) + { + $result = 1; + } + elsif ($tokens->[$$indexRef] =~ /^[ \t]/) + { + $$indexRef++; + $result = 1; + } + elsif ($tokens->[$$indexRef] eq "\n") + { + $$indexRef++; + $$lineNumberRef++; + $result = 1; + } + else + { last; }; + }; + + return $result; + }; + + +# +# Function: TryToSkipComment +# If the current position is on a comment, skip past it and return true. +# +sub TryToSkipComment #(indexRef, lineNumberRef) + { + my ($self, $indexRef, $lineNumberRef) = @_; + + return ( $self->TryToSkipLineComment($indexRef, $lineNumberRef) || + $self->TryToSkipPODComment($indexRef, $lineNumberRef) ); + }; + + +# +# Function: TryToSkipLineComment +# If the current position is on a line comment symbol, skip past it and return true. +# +sub TryToSkipLineComment #(indexRef, lineNumberRef) + { + my ($self, $indexRef, $lineNumberRef) = @_; + my $tokens = $self->Tokens(); + + # Note that $#var is not a comment. + if ($tokens->[$$indexRef] eq '#' && !$self->IsStringed($$indexRef)) + { + $self->SkipRestOfLine($indexRef, $lineNumberRef); + return 1; + } + else + { return undef; }; + }; + + +# +# Function: TryToSkipPODComment +# If the current position is on a POD comment symbol, skip past it and return true. +# +sub TryToSkipPODComment #(indexRef, lineNumberRef) + { + my ($self, $indexRef, $lineNumberRef) = @_; + my $tokens = $self->Tokens(); + + # Note that whitespace is not allowed before the equals sign. It must directly start a line. + if ($tokens->[$$indexRef] eq '=' && + ( $$indexRef == 0 || $tokens->[$$indexRef - 1] eq "\n" ) && + $tokens->[$$indexRef + 1] =~ /^[a-z]/i ) + { + # Skip until =cut or (NDPODBREAK). Note that it's theoretically possible for =cut to appear without a prior POD directive. + + do + { + if ($tokens->[$$indexRef] eq '=' && lc( $tokens->[$$indexRef + 1] ) eq 'cut') + { + $self->SkipRestOfLine($indexRef, $lineNumberRef); + last; + } + elsif ($tokens->[$$indexRef] eq '(' && $$indexRef + 2 < scalar @$tokens && + $tokens->[$$indexRef+1] eq 'NDPODBREAK' && $tokens->[$$indexRef+2] eq ')') + { + $$indexRef += 3; + last; + } + else + { + $self->SkipRestOfLine($indexRef, $lineNumberRef); + }; + } + while ($$indexRef < scalar @$tokens); + + return 1; + } + + # It's also possible that (NDPODBREAK) will appear without any opening pod statement because "=begin nd" and "=cut" will + # still result in one. We need to pick off the stray (NDPODBREAK). + elsif ($tokens->[$$indexRef] eq '(' && $$indexRef + 2 < scalar @$tokens && + $tokens->[$$indexRef+1] eq 'NDPODBREAK' && $tokens->[$$indexRef+2] eq ')') + { + $$indexRef += 3; + return 1; + } + + else + { return undef; }; + }; + + +# +# Function: TryToSkipString +# If the current position is on a string delimiter, skip past the string and return true. +# +# Parameters: +# +# indexRef - A reference to the index of the position to start at. +# lineNumberRef - A reference to the line number of the position. +# startContentIndexRef - A reference to the variable in which to store the index of the first content token. May be undef. +# endContentIndexRef - A reference to the variable in which to store the index of the end of the content, which is one past +# the last content token. may be undef. +# +# Returns: +# +# Whether the position was at a string. The index, line number, and content index variabls will only be changed if true. +# +# Syntax Support: +# +# - Supports quotes, apostrophes, backticks, q(), qq(), qx(), and qw(). +# - All symbols are supported for the letter forms. +# +sub TryToSkipString #(indexRef, lineNumberRef, startContentIndexRef, endContentIndexRef) + { + my ($self, $indexRef, $lineNumberRef, $startContentIndexRef, $endContentIndexRef) = @_; + my $tokens = $self->Tokens(); + + # The three string delimiters. All three are Perl variables when preceded by a dollar sign. + if (!$self->IsStringed($$indexRef) && + ( $self->SUPER::TryToSkipString($indexRef, $lineNumberRef, '\'', '\'', $startContentIndexRef, $endContentIndexRef) || + $self->SUPER::TryToSkipString($indexRef, $lineNumberRef, '"', '"', $startContentIndexRef, $endContentIndexRef) || + $self->SUPER::TryToSkipString($indexRef, $lineNumberRef, '`', '`', $startContentIndexRef, $endContentIndexRef) ) ) + { + return 1; + } + elsif ($tokens->[$$indexRef] =~ /^(?:q|qq|qx|qw)$/i && + ($$indexRef == 0 || $tokens->[$$indexRef - 1] !~ /^[\$\%\@\*]$/)) + { + $$indexRef++; + + $self->TryToSkipWhitespace($indexRef, $lineNumberRef); + + my $openingSymbol = $tokens->[$$indexRef]; + my $closingSymbol; + + if ($openingSymbol eq '{') + { $closingSymbol = '}'; } + elsif ($openingSymbol eq '(') + { $closingSymbol = ')'; } + elsif ($openingSymbol eq '[') + { $closingSymbol = ']'; } + elsif ($openingSymbol eq '<') + { $closingSymbol = '>'; } + else + { $closingSymbol = $openingSymbol; }; + + $self->SUPER::TryToSkipString($indexRef, $lineNumberRef, $openingSymbol, $closingSymbol, + $startContentIndexRef, $endContentIndexRef); + + return 1; + } + else + { return undef; }; + }; + + +# +# Function: TryToSkipHereDocDeclaration +# +# If the current position is on a Here Doc declaration, add its terminators to <hereDocTerminators> and skip it. +# +# Syntax Support: +# +# - Supports <<EOF +# - Supports << "String" with all string forms supported by <TryToSkipString()>. +# +sub TryToSkipHereDocDeclaration #(indexRef, lineNumberRef) + { + my ($self, $indexRef, $lineNumberRef) = @_; + my $tokens = $self->Tokens(); + + my $index = $$indexRef; + my $lineNumber = $$lineNumberRef; + + if ($tokens->[$index] eq '<' && $tokens->[$index + 1] eq '<') + { + $index += 2; + my $success; + + # No whitespace allowed with the bare word. + if ($tokens->[$index] =~ /^[a-z0-9_]/i) + { + push @hereDocTerminators, [ $tokens->[$index] ]; + $index++; + $success = 1; + } + else + { + $self->TryToSkipWhitespace(\$index, \$lineNumber); + + my ($contentStart, $contentEnd); + if ($self->TryToSkipString(\$index, \$lineNumber, \$contentStart, \$contentEnd)) + { + push @hereDocTerminators, [ @{$tokens}[$contentStart..$contentEnd - 1] ]; + $success = 1; + }; + }; + + if ($success) + { + $$indexRef = $index; + $$lineNumberRef = $lineNumber; + + return 1; + }; + }; + + return 0; + }; + + +# +# Function: TryToSkipHereDocContent +# +# If the current position is at the beginning of a line and there are entries in <hereDocTerminators>, skips lines until all the +# terminators are exhausted or we reach the end of the file. +# +# Returns: +# +# Whether the position was on Here Doc content. +# +sub TryToSkipHereDocContent #(indexRef, lineNumberRef) + { + my ($self, $indexRef, $lineNumberRef) = @_; + my $tokens = $self->Tokens(); + + # We don't use IsFirstLineToken() because it really needs to be the first line token. Whitespace is not allowed. + if ($$indexRef > 0 && $tokens->[$$indexRef - 1] eq "\n") + { + my $success = (scalar @hereDocTerminators > 0); + + while (scalar @hereDocTerminators && $$indexRef < scalar @$tokens) + { + my $terminatorIndex = 0; + + while ($hereDocTerminators[0]->[$terminatorIndex] eq $tokens->[$$indexRef]) + { + $terminatorIndex++; + $$indexRef++; + }; + + if ($terminatorIndex == scalar @{$hereDocTerminators[0]} && + ($tokens->[$$indexRef] eq "\n" || ($tokens->[$$indexRef] =~ /^[ \t]/ && $tokens->[$$indexRef + 1] eq "\n")) ) + { + shift @hereDocTerminators; + $$indexRef++; + $$lineNumberRef++; + } + else + { $self->SkipRestOfLine($indexRef, $lineNumberRef); }; + }; + + return $success; + } + + else + { return 0; }; + }; + + +# +# Function: TryToSkipRegexp +# If the current position is on a regular expression or a quote-like operator, skip past it and return true. +# +# Syntax Support: +# +# - Supports //, ??, m//, qr//, s///, tr///, and y///. +# - All symbols are supported for the letter forms. +# - ?? is *not* supported because it could cause problems with ?: statements. The generic parser has a good chance of +# successfully stumbling through a regex, whereas the regex code will almost certainly see the rest of the file as part of it. +# +sub TryToSkipRegexp #(indexRef, lineNumberRef) + { + my ($self, $indexRef, $lineNumberRef) = @_; + my $tokens = $self->Tokens(); + + my $isRegexp; + + # If it's a supported character sequence that's not a variable (ex $qr)... + if ($tokens->[$$indexRef] =~ /^(?:m|qr|s|tr|y)$/i && + ($$indexRef == 0 || $tokens->[$$indexRef - 1] !~ /^[\$\%\@\*\-]$/) ) + { $isRegexp = 1; } + + elsif ($tokens->[$$indexRef] eq '/' && !$self->IsStringed($$indexRef)) + { + # This is a bit of a hack. If we find a random slash, it could be a divide operator or a bare regexp. Find the first previous + # non-whitespace token and if it's text, a closing brace, or a string, assume it's a divide operator. (Strings don't make + # much pratical sense there but a regexp would be impossible.) Otherwise assume it's a regexp. + + # We make a special consideration for split() appearing without parenthesis. If the previous token is split and it's not a + # variable, assume it is a regexp even though it fails the above test. + + my $index = $$indexRef - 1; + + while ($index >= 0 && $tokens->[$index] =~ /^(?: |\t|\n)/) + { $index--; }; + + if ($index < 0 || $tokens->[$index] !~ /^[a-zA-Z0-9_\)\]\}\'\"\`]/ || + ($tokens->[$index] =~ /^split|grep$/ && $index > 0 && $tokens->[$index-1] !~ /^[\$\%\@\*]$/) ) + { $isRegexp = 1; }; + }; + + if ($isRegexp) + { + my $operator = lc($tokens->[$$indexRef]); + my $index = $$indexRef; + my $lineNumber = $$lineNumberRef; + + if ($operator =~ /^[\?\/]/) + { $operator = 'm'; } + else + { + $index++; + + # Believe it or not, s#...# is allowed. We can't pass over number signs here. + if ($tokens->[$index] ne '#') + { $self->TryToSkipWhitespace(\$index, \$lineNumber); }; + }; + + if ($tokens->[$index] =~ /^\w/) + { return undef; }; + if ($tokens->[$index] eq '=' && $tokens->[$index+1] eq '>') + { return undef; }; + + my $openingSymbol = $tokens->[$index]; + my $closingSymbol; + + if ($openingSymbol eq '{') + { $closingSymbol = '}'; } + elsif ($openingSymbol eq '(') + { $closingSymbol = ')'; } + elsif ($openingSymbol eq '[') + { $closingSymbol = ']'; } + elsif ($openingSymbol eq '<') + { $closingSymbol = '>'; } + else + { $closingSymbol = $openingSymbol; }; + + $index++; + + $self->GenericRegexpSkipUntilAfter(\$index, \$lineNumber, $closingSymbol); + + $$indexRef = $index; + $$lineNumberRef = $lineNumber; + + if ($operator =~ /^(?:s|tr|y)$/) + { + if ($openingSymbol ne $closingSymbol) + { + $self->TryToSkipWhitespace($indexRef, $lineNumberRef); + + $openingSymbol = $tokens->[$index]; + + if ($openingSymbol eq '{') + { $closingSymbol = '}'; } + elsif ($openingSymbol eq '(') + { $closingSymbol = ')'; } + elsif ($openingSymbol eq '[') + { $closingSymbol = ']'; } + elsif ($openingSymbol eq '<') + { $closingSymbol = '>'; } + else + { $closingSymbol = $openingSymbol; }; + + $$indexRef++; + }; + + if ($operator eq 's') + { + $self->GenericSkipUntilAfter($indexRef, $lineNumberRef, $closingSymbol, 1); + } + else # ($operator eq 'tr' || $operator eq 'y') + { + while ($$indexRef < scalar @$tokens && + ($tokens->[$$indexRef] ne $closingSymbol || $self->IsBackslashed($$indexRef)) ) + { + if ($tokens->[$$indexRef] eq "\n") + { $$lineNumberRef++; }; + $$indexRef++; + }; + + $$indexRef++; + }; + }; + + # We want to skip any letters after the regexp. Otherwise something like tr/a/b/s; could have the trailing s; interpreted + # as another regexp. Whitespace is not allowed between the closing symbol and the letters. + + if ($tokens->[$$indexRef] =~ /^[a-z]/i) + { $$indexRef++; }; + + return 1; + }; + + return undef; + }; + + + +############################################################################### +# Group: Support Functions + + +# +# Function: IsStringed +# +# Returns whether the position is after a string (dollar sign) character. Returns false if it's preceded by two dollar signs so +# "if ($x == $$)" doesn't skip the closing parenthesis as stringed. +# +# Parameters: +# +# index - The index of the postition. +# +sub IsStringed #(index) + { + my ($self, $index) = @_; + my $tokens = $self->Tokens(); + + if ($index > 0 && $tokens->[$index - 1] eq '$' && !($index > 1 && $tokens->[$index - 2] eq '$')) + { return 1; } + else + { return undef; }; + }; + + +1; |