From 64dea5009b99d34e268414b7c50d3a2a53216782 Mon Sep 17 00:00:00 2001 From: Robert Einhorn Date: Sun, 15 Sep 2024 20:35:16 +0200 Subject: [PATCH 1/2] added TypeScript target and some modifications --- python/pom.xml | 2 +- python/python2_7_18/CSharp/PythonLexerBase.cs | 117 ++-- python/python2_7_18/Java/PythonLexerBase.java | 577 +++++++++--------- .../JavaScript/PythonLexerBase.js | 208 +++---- .../python2_7_18/Python3/PythonLexerBase.py | 296 +++++---- python/python2_7_18/PythonLexer.g4 | 15 +- .../TypeScript/PythonLexerBase.ts | 306 ++++++++++ python/python2_7_18/changes.txt | 4 + python/python2_7_18/desc.xml | 6 +- .../test_error_first_statement_indented.py | 2 +- .../tests/test_error_inconsistent_dedent.py | 2 +- ...test_error_tab_and_space_in_indentation.py | 2 +- .../tests/test_error_unexpected_indent.py | 2 +- .../tests/test_explicit_line_joining.py | 2 +- .../test_hidden_NEWLINE_before_comment.py | 2 +- ...ng_literal_with_newline_escape_sequence.py | 10 - python/python3_12_1/CSharp/PythonLexerBase.cs | 122 ++-- python/python3_12_1/Java/PythonLexerBase.java | 111 ++-- .../JavaScript/PythonLexerBase.js | 237 ++++--- .../python3_12_1/Python3/PythonLexerBase.py | 360 ++++++----- ....peg => Python3_12_6_official_grammar.peg} | 6 +- python/python3_12_1/PythonLexer.g4 | 25 +- python/python3_12_1/PythonParser.g4 | 6 +- python/python3_12_1/README.md | 2 +- .../TypeScript/PythonLexerBase.ts | 392 ++++++++++++ .../TypeScript/PythonParserBase.ts | 16 + python/python3_12_1/changes.txt | 7 + python/python3_12_1/desc.xml | 6 +- .../test_double_braces_in_fstring_literal.py | 9 - .../test_error_first_statement_indented.py | 2 +- .../tests/test_error_inconsistent_dedent.py | 2 +- ...test_error_tab_and_space_in_indentation.py | 2 +- .../tests/test_error_unexpected_indent.py | 2 +- .../tests/test_explicit_line_joining.py | 2 +- .../test_hidden_NEWLINE_before_comment.py | 2 +- .../test_lambda_colon_in_fstring_literal.py | 8 - ...format specification_in_fstring_literal.py | 9 
- ...ng_literal_with_newline_escape_sequence.py | 10 - 38 files changed, 1703 insertions(+), 1188 deletions(-) create mode 100644 python/python2_7_18/TypeScript/PythonLexerBase.ts create mode 100644 python/python2_7_18/changes.txt delete mode 100644 python/python2_7_18/tests/test_string_literal_with_newline_escape_sequence.py rename python/python3_12_1/{Python3_12_1_official_grammar.peg => Python3_12_6_official_grammar.peg} (99%) create mode 100644 python/python3_12_1/TypeScript/PythonLexerBase.ts create mode 100644 python/python3_12_1/TypeScript/PythonParserBase.ts create mode 100644 python/python3_12_1/changes.txt delete mode 100644 python/python3_12_1/tests/test_double_braces_in_fstring_literal.py delete mode 100644 python/python3_12_1/tests/test_lambda_colon_in_fstring_literal.py delete mode 100644 python/python3_12_1/tests/test_missing format specification_in_fstring_literal.py delete mode 100644 python/python3_12_1/tests/test_string_literal_with_newline_escape_sequence.py diff --git a/python/pom.xml b/python/pom.xml index ce079ba36a..669998e1b3 100644 --- a/python/pom.xml +++ b/python/pom.xml @@ -16,6 +16,6 @@ python2 python3 python2_7_18 - python3_12_1 + python3_12 diff --git a/python/python2_7_18/CSharp/PythonLexerBase.cs b/python/python2_7_18/CSharp/PythonLexerBase.cs index 627313018d..7902984380 100644 --- a/python/python2_7_18/CSharp/PythonLexerBase.cs +++ b/python/python2_7_18/CSharp/PythonLexerBase.cs @@ -37,6 +37,7 @@ public abstract class PythonLexerBase : Lexer private Stack indentLengthStack; // A list where tokens are waiting to be loaded into the token stream private LinkedList pendingTokens; + // last pending token types private int previousPendingTokenType; private int lastPendingTokenTypeFromDefaultChannel; @@ -47,11 +48,11 @@ public abstract class PythonLexerBase : Lexer private bool wasSpaceIndentation; private bool wasTabIndentation; private bool wasIndentationMixedWithSpacesAndTabs; - private const int INVALID_LENGTH = -1; - private 
CommonToken curToken; // current (under processing) token - private IToken ffgToken; // following (look ahead) token + private IToken curToken; // current (under processing) token + private IToken ffgToken; // following (look ahead) token + private const int INVALID_LENGTH = -1; private const string ERR_TXT = " ERROR: "; protected PythonLexerBase(ICharStream input) : base(input) @@ -64,6 +65,20 @@ protected PythonLexerBase(ICharStream input, TextWriter output, TextWriter error this.Init(); } + public override IToken NextToken() // reading the input stream until a return EOF + { + this.CheckNextToken(); + IToken firstPendingToken = this.pendingTokens.First.Value; + this.pendingTokens.RemoveFirst(); + return firstPendingToken; // add the queued token to the token stream + } + + public override void Reset() + { + this.Init(); + base.Reset(); + } + private void Init() { this.indentLengthStack = new Stack(); @@ -78,14 +93,6 @@ private void Init() this.ffgToken = null!; } - public override IToken NextToken() // reading the input stream until a return EOF - { - this.CheckNextToken(); - IToken firstPendingToken = this.pendingTokens.First.Value; - this.pendingTokens.RemoveFirst(); - return firstPendingToken; // add the queued token to the token stream - } - private void CheckNextToken() { if (this.previousPendingTokenType != TokenConstants.EOF) @@ -113,10 +120,7 @@ private void CheckNextToken() case PythonLexer.NEWLINE: this.HandleNEWLINEtoken(); break; - case PythonLexer.STRING: - this.HandleSTRINGtoken(); - break; - case PythonLexer.ERROR_TOKEN: + case PythonLexer.ERRORTOKEN: this.ReportLexerError("token recognition error at: '" + this.curToken.Text + "'"); this.AddPendingToken(this.curToken); break; @@ -133,12 +137,12 @@ private void CheckNextToken() private void SetCurrentAndFollowingTokens() { this.curToken = this.ffgToken == null ? 
- new CommonToken(base.NextToken()) : - new CommonToken(this.ffgToken); + base.NextToken() : + this.ffgToken; this.ffgToken = this.curToken.Type == TokenConstants.EOF ? - this.curToken : - base.NextToken(); + this.curToken : + base.NextToken(); } // initialize the _indentLengths @@ -196,7 +200,7 @@ private void HandleNEWLINEtoken() } else { - CommonToken nlToken = new CommonToken(this.curToken); // save the current NEWLINE token + IToken nlToken = new CommonToken(this.curToken); // save the current NEWLINE token bool isLookingAhead = this.ffgToken.Type == PythonLexer.WS; if (isLookingAhead) { @@ -205,12 +209,12 @@ private void HandleNEWLINEtoken() switch (this.ffgToken.Type) { - case PythonLexer.NEWLINE: // We're before a blank line - case PythonLexer.COMMENT: // We're before a comment + case PythonLexer.NEWLINE: // We're before a blank line + case PythonLexer.COMMENT: // We're before a comment this.HideAndAddPendingToken(nlToken); if (isLookingAhead) { - this.AddPendingToken(this.curToken); // WS token + this.AddPendingToken(this.curToken); // WS token } break; default: @@ -243,7 +247,6 @@ private void HandleNEWLINEtoken() private void InsertIndentOrDedentToken(int indentLength) { - //*** https://docs.python.org/3/reference/lexical_analysis.html#indentation int prevIndentLength = this.indentLengthStack.Peek(); if (indentLength > prevIndentLength) { @@ -268,25 +271,6 @@ private void InsertIndentOrDedentToken(int indentLength) } } - private void HandleSTRINGtoken() - { - // remove the \ escape sequences from the string literal - // https://docs.python.org/3.11/reference/lexical_analysis.html#string-and-bytes-literals - string line_joinFreeStringLiteral = Regex.Replace(this.curToken.Text, @"\\\r?\n", ""); - if (this.curToken.Text.Length == line_joinFreeStringLiteral.Length) - { - this.AddPendingToken(this.curToken); - } - else - { - CommonToken originalSTRINGtoken = new CommonToken(this.curToken); // backup the original token - this.curToken.Text = 
line_joinFreeStringLiteral; - this.AddPendingToken(this.curToken); // add the modified token with inline string literal - this.HideAndAddPendingToken(originalSTRINGtoken); // add the original token with a hidden channel - // this inserted hidden token allows to restore the original string literal with the \ escape sequences - } - } - private void InsertTrailingTokens() { switch (this.lastPendingTokenTypeFromDefaultChannel) @@ -311,42 +295,43 @@ private void HandleEOFtoken() this.AddPendingToken(this.curToken); } - private void HideAndAddPendingToken(CommonToken cToken) + private void HideAndAddPendingToken(IToken tkn) { - cToken.Channel = TokenConstants.HiddenChannel; - this.AddPendingToken(cToken); + CommonToken ctkn = new CommonToken(tkn); + ctkn.Channel = TokenConstants.HiddenChannel; + this.AddPendingToken(ctkn); } - private void CreateAndAddPendingToken(int type, int channel, string text, IToken baseToken) + private void CreateAndAddPendingToken(int ttype, int channel, string text, IToken sampleToken) { - CommonToken cToken = new CommonToken(baseToken); - cToken.Type = type; - cToken.Channel = channel; - cToken.StopIndex = baseToken.StartIndex - 1; + CommonToken ctkn = new CommonToken(sampleToken); + ctkn.Type = ttype; + ctkn.Channel = channel; + ctkn.StopIndex = sampleToken.StartIndex - 1; - cToken.Text = text == null - ? "<" + Vocabulary.GetSymbolicName(type) + ">" + ctkn.Text = text == null + ? 
"<" + Vocabulary.GetSymbolicName(ttype) + ">" : text; - this.AddPendingToken(cToken); + this.AddPendingToken(ctkn); } - private void AddPendingToken(IToken token) + private void AddPendingToken(IToken tkn) { // save the last pending token type because the pendingTokens linked list can be empty by the nextToken() - this.previousPendingTokenType = token.Type; - if (token.Channel == TokenConstants.DefaultChannel) + this.previousPendingTokenType = tkn.Type; + if (tkn.Channel == TokenConstants.DefaultChannel) { this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType; } - this.pendingTokens.AddLast(token); + this.pendingTokens.AddLast(tkn); } - private int GetIndentationLength(string textWS) // the textWS may contain spaces, tabs or form feeds + private int GetIndentationLength(string indentText) // the indentText may contain spaces, tabs or form feeds { const int TAB_LENGTH = 8; // the standard number of spaces to replace a tab with spaces int length = 0; - foreach (char ch in textWS) + foreach (char ch in indentText) { switch (ch) { @@ -369,7 +354,7 @@ private int GetIndentationLength(string textWS) // the textWS may contain spaces if (!this.wasIndentationMixedWithSpacesAndTabs) { this.wasIndentationMixedWithSpacesAndTabs = true; - return PythonLexerBase.INVALID_LENGTH; // only for the first inconsistent indent + length = PythonLexerBase.INVALID_LENGTH; // only for the first inconsistent indent } } return length; @@ -384,13 +369,7 @@ private void ReportError(string errMsg) { this.ReportLexerError(errMsg); - // the ERROR_TOKEN will raise an error in the parser - this.CreateAndAddPendingToken(PythonLexer.ERROR_TOKEN, TokenConstants.DefaultChannel, PythonLexerBase.ERR_TXT + errMsg, this.ffgToken); - } - - public override void Reset() - { - this.Init(); - base.Reset(); + // the ERRORTOKEN will raise an error in the parser + this.CreateAndAddPendingToken(PythonLexer.ERRORTOKEN, TokenConstants.DefaultChannel, PythonLexerBase.ERR_TXT + errMsg, 
this.ffgToken); } } diff --git a/python/python2_7_18/Java/PythonLexerBase.java b/python/python2_7_18/Java/PythonLexerBase.java index d09dc1171b..2abaa1769a 100644 --- a/python/python2_7_18/Java/PythonLexerBase.java +++ b/python/python2_7_18/Java/PythonLexerBase.java @@ -27,298 +27,285 @@ of this software and associated documentation files (the "Software"), to deal * */ -import java.util.*; - -import org.antlr.v4.runtime.*; - -public abstract class PythonLexerBase extends Lexer { - // A stack that keeps track of the indentation lengths - private Deque indentLengthStack; - // A list where tokens are waiting to be loaded into the token stream - private LinkedList pendingTokens; - - // last pending token types - private int previousPendingTokenType; - private int lastPendingTokenTypeFromDefaultChannel; - - // The amount of opened parentheses, square brackets or curly braces - private int opened; - - private boolean wasSpaceIndentation; - private boolean wasTabIndentation; - private boolean wasIndentationMixedWithSpacesAndTabs; - private final int INVALID_LENGTH = -1; - - private CommonToken curToken; // current (under processing) token - private Token ffgToken; // following (look ahead) token - - private final String ERR_TXT = " ERROR: "; - - protected PythonLexerBase(CharStream input) { - super(input); - this.init(); - } - - private void init() { - this.indentLengthStack = new ArrayDeque<>(); - this.pendingTokens = new LinkedList<>(); - this.previousPendingTokenType = 0; - this.lastPendingTokenTypeFromDefaultChannel = 0; - this.opened = 0; - this.wasSpaceIndentation = false; - this.wasTabIndentation = false; - this.wasIndentationMixedWithSpacesAndTabs = false; - this.curToken = null; - this.ffgToken = null; - } - - @Override - public Token nextToken() { // reading the input stream until a return EOF - this.checkNextToken(); - return this.pendingTokens.pollFirst(); // add the queued token to the token stream - } - - private void checkNextToken() { - if 
(this.previousPendingTokenType != Token.EOF) { - this.setCurrentAndFollowingTokens(); - if (this.indentLengthStack.isEmpty()) { // We're at the first token - this.handleStartOfInput(); - } - - switch (this.curToken.getType()) { - case PythonLexer.LPAR: - case PythonLexer.LSQB: - case PythonLexer.LBRACE: - this.opened++; - this.addPendingToken(this.curToken); - break; - case PythonLexer.RPAR: - case PythonLexer.RSQB: - case PythonLexer.RBRACE: - this.opened--; - this.addPendingToken(this.curToken); - break; - case PythonLexer.NEWLINE: - this.handleNEWLINEtoken(); - break; - case PythonLexer.STRING: - this.handleSTRINGtoken(); - break; - case PythonLexer.ERROR_TOKEN: - this.reportLexerError("token recognition error at: '" + this.curToken.getText() + "'"); - this.addPendingToken(this.curToken); - break; - case Token.EOF: - this.handleEOFtoken(); - break; - default: - this.addPendingToken(this.curToken); - } - } - } - - private void setCurrentAndFollowingTokens() { - this.curToken = this.ffgToken == null ? - new CommonToken(super.nextToken()) : - new CommonToken(this.ffgToken); - - this.ffgToken = this.curToken.getType() == Token.EOF ? 
- this.curToken : - super.nextToken(); - } - - // initialize the indentLengthStack - // hide the leading NEWLINE token(s) - // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel - // insert a leading INDENT token if necessary - private void handleStartOfInput() { - // initialize the stack with a default 0 indentation length - this.indentLengthStack.push(0); // this will never be popped off - while (this.curToken.getType() != Token.EOF) { - if (this.curToken.getChannel() == Token.DEFAULT_CHANNEL) { - if (this.curToken.getType() == PythonLexer.NEWLINE) { - // all the NEWLINE tokens must be ignored before the first statement - this.hideAndAddPendingToken(this.curToken); - } else { // We're at the first statement - this.insertLeadingIndentToken(); - return; // continue the processing of the current token with checkNextToken() - } - } else { - this.addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token - } - this.setCurrentAndFollowingTokens(); - } // continue the processing of the EOF token with checkNextToken() - } - - private void insertLeadingIndentToken() { - if (this.previousPendingTokenType == PythonLexer.WS) { - Token prevToken = this.pendingTokens.peekLast(); // WS token - if (this.getIndentationLength(prevToken.getText()) != 0) { // there is an "indentation" before the first statement - final String errMsg = "first statement indented"; - this.reportLexerError(errMsg); - // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser - this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.curToken); - } - } - } - - private void handleNEWLINEtoken() { - if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token - this.hideAndAddPendingToken(this.curToken); - } else { - CommonToken nlToken = new CommonToken(this.curToken); // save the current NEWLINE 
token - final boolean isLookingAhead = this.ffgToken.getType() == PythonLexer.WS; - if (isLookingAhead) { - this.setCurrentAndFollowingTokens(); // set the next two tokens - } - - switch (this.ffgToken.getType()) { - case PythonLexer.NEWLINE: // We're before a blank line - case PythonLexer.COMMENT: // We're before a comment - this.hideAndAddPendingToken(nlToken); - if (isLookingAhead) { - this.addPendingToken(this.curToken); // WS token - } - break; - default: - this.addPendingToken(nlToken); - if (isLookingAhead) { // We're on whitespace(s) followed by a statement - final int indentationLength = this.ffgToken.getType() == Token.EOF ? - 0 : - this.getIndentationLength(this.curToken.getText()); - - if (indentationLength != this.INVALID_LENGTH) { - this.addPendingToken(this.curToken); // WS token - this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) - } else { - this.reportError("inconsistent use of tabs and spaces in indentation"); - } - } else { // We're at a newline followed by a statement (there is no whitespace before the statement) - this.insertIndentOrDedentToken(0); // may insert DEDENT token(s) - } - } - } - } - - private void insertIndentOrDedentToken(final int indentLength) { - int prevIndentLength = this.indentLengthStack.peek(); - if (indentLength > prevIndentLength) { - this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken); - this.indentLengthStack.push(indentLength); - } else { - while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream - this.indentLengthStack.pop(); - prevIndentLength = this.indentLengthStack.peek(); - if (indentLength <= prevIndentLength) { - this.createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken); - } else { - this.reportError("inconsistent dedent"); - } - } - } - } - - private void handleSTRINGtoken() { // remove the \ escape sequences from the string literal - 
final String line_joinFreeStringLiteral = this.curToken.getText().replaceAll("\\\\\\r?\\n", ""); - if (this.curToken.getText().length() == line_joinFreeStringLiteral.length()) { - this.addPendingToken(this.curToken); - } else { - CommonToken originalSTRINGtoken = new CommonToken(this.curToken); // backup the original token - this.curToken.setText(line_joinFreeStringLiteral); - this.addPendingToken(this.curToken); // add the modified token with inline string literal - this.hideAndAddPendingToken(originalSTRINGtoken); // add the original token to the hidden channel - // this inserted hidden token allows to restore the original string literal with the \ escape sequences - } - } - - private void insertTrailingTokens() { - switch (this.lastPendingTokenTypeFromDefaultChannel) { - case PythonLexer.NEWLINE: - case PythonLexer.DEDENT: - break; // no trailing NEWLINE token is needed - default: - // insert an extra trailing NEWLINE token that serves as the end of the last statement - this.createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken); // ffgToken is EOF - } - this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed - } - - private void handleEOFtoken() { - if (this.lastPendingTokenTypeFromDefaultChannel > 0) { - // there was statement in the input (leading NEWLINE tokens are hidden) - this.insertTrailingTokens(); - } - this.addPendingToken(this.curToken); - } - - private void hideAndAddPendingToken(CommonToken cToken) { - cToken.setChannel(Token.HIDDEN_CHANNEL); - this.addPendingToken(cToken); - } - - private void createAndAddPendingToken(final int type, final int channel, final String text, Token baseToken) { - CommonToken cToken = new CommonToken(baseToken); - cToken.setType(type); - cToken.setChannel(channel); - cToken.setStopIndex(baseToken.getStartIndex() - 1); - cToken.setText(text == null - ? 
"<" + this.getVocabulary().getSymbolicName(type) + ">" - : text); - - this.addPendingToken(cToken); - } - - private void addPendingToken(final Token token) { - // save the last pending token type because the pendingTokens linked list can be empty by the nextToken() - this.previousPendingTokenType = token.getType(); - if (token.getChannel() == Token.DEFAULT_CHANNEL) { - this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType; - } - this.pendingTokens.addLast(token); - } - - private int getIndentationLength(final String textWS) { // the textWS may contain spaces, tabs or form feeds - final int TAB_LENGTH = 8; // the standard number of spaces to replace a tab to spaces - int length = 0; - for (char ch : textWS.toCharArray()) { - switch (ch) { - case ' ': - this.wasSpaceIndentation = true; - length += 1; - break; - case '\t': - this.wasTabIndentation = true; - length += TAB_LENGTH - (length % TAB_LENGTH); - break; - case '\f': // form feed - length = 0; - break; - } - } - - if (this.wasTabIndentation && this.wasSpaceIndentation) { - if (!(this.wasIndentationMixedWithSpacesAndTabs)) { - this.wasIndentationMixedWithSpacesAndTabs = true; - return this.INVALID_LENGTH; // only for the first inconsistent indent - } - } - return length; - } - - private void reportLexerError(final String errMsg) { - this.getErrorListenerDispatch().syntaxError(this, this.curToken, this.curToken.getLine(), this.curToken.getCharPositionInLine(), " LEXER" + this.ERR_TXT + errMsg, null); - } - - private void reportError(final String errMsg) { - this.reportLexerError(errMsg); - - // the ERROR_TOKEN will raise an error in the parser - this.createAndAddPendingToken(PythonLexer.ERROR_TOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken); - } - - @Override - public void reset() { - this.init(); - super.reset(); - } -} + import java.util.*; + + import org.antlr.v4.runtime.*; + + public abstract class PythonLexerBase extends Lexer { + // A stack that keeps track of the 
indentation lengths + private Deque indentLengthStack; + // A list where tokens are waiting to be loaded into the token stream + private LinkedList pendingTokens; + + // last pending token types + private int previousPendingTokenType; + private int lastPendingTokenTypeFromDefaultChannel; + + // The amount of opened parentheses, square brackets or curly braces + private int opened; + + private boolean wasSpaceIndentation; + private boolean wasTabIndentation; + private boolean wasIndentationMixedWithSpacesAndTabs; + + private Token curToken; // current (under processing) token + private Token ffgToken; // following (look ahead) token + + private final int INVALID_LENGTH = -1; + private final String ERR_TXT = " ERROR: "; + + protected PythonLexerBase(CharStream input) { + super(input); + this.init(); + } + + @Override + public Token nextToken() { // reading the input stream until a return EOF + this.checkNextToken(); + return this.pendingTokens.pollFirst(); // add the queued token to the token stream + } + + @Override + public void reset() { + this.init(); + super.reset(); + } + + private void init() { + this.indentLengthStack = new ArrayDeque<>(); + this.pendingTokens = new LinkedList<>(); + this.previousPendingTokenType = 0; + this.lastPendingTokenTypeFromDefaultChannel = 0; + this.opened = 0; + this.wasSpaceIndentation = false; + this.wasTabIndentation = false; + this.wasIndentationMixedWithSpacesAndTabs = false; + this.curToken = null; + this.ffgToken = null; + } + + private void checkNextToken() { + if (this.previousPendingTokenType != Token.EOF) { + this.setCurrentAndFollowingTokens(); + if (this.indentLengthStack.isEmpty()) { // We're at the first token + this.handleStartOfInput(); + } + + switch (this.curToken.getType()) { + case PythonLexer.LPAR: + case PythonLexer.LSQB: + case PythonLexer.LBRACE: + this.opened++; + this.addPendingToken(this.curToken); + break; + case PythonLexer.RPAR: + case PythonLexer.RSQB: + case PythonLexer.RBRACE: + this.opened--; + 
this.addPendingToken(this.curToken); + break; + case PythonLexer.NEWLINE: + this.handleNEWLINEtoken(); + break; + case PythonLexer.ERRORTOKEN: + this.reportLexerError("token recognition error at: '" + this.curToken.getText() + "'"); + this.addPendingToken(this.curToken); + break; + case Token.EOF: + this.handleEOFtoken(); + break; + default: + this.addPendingToken(this.curToken); + } + } + } + + private void setCurrentAndFollowingTokens() { + this.curToken = this.ffgToken == null ? + super.nextToken() : + this.ffgToken; + + this.ffgToken = this.curToken.getType() == Token.EOF ? + this.curToken : + super.nextToken(); + } + + // initialize the indentLengthStack + // hide the leading NEWLINE token(s) + // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel + // insert a leading INDENT token if necessary + private void handleStartOfInput() { + // initialize the stack with a default 0 indentation length + this.indentLengthStack.push(0); // this will never be popped off + while (this.curToken.getType() != Token.EOF) { + if (this.curToken.getChannel() == Token.DEFAULT_CHANNEL) { + if (this.curToken.getType() == PythonLexer.NEWLINE) { + // all the NEWLINE tokens must be ignored before the first statement + this.hideAndAddPendingToken(this.curToken); + } else { // We're at the first statement + this.insertLeadingIndentToken(); + return; // continue the processing of the current token with checkNextToken() + } + } else { + this.addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token + } + this.setCurrentAndFollowingTokens(); + } + // continue the processing of the EOF token with checkNextToken() + } + + private void insertLeadingIndentToken() { + if (this.previousPendingTokenType == PythonLexer.WS) { + Token prevToken = this.pendingTokens.peekLast(); // WS token + if (this.getIndentationLength(prevToken.getText()) != 0) { // there is an "indentation" before the first statement + final String 
errMsg = "first statement indented"; + this.reportLexerError(errMsg); + // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser + this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.curToken); + } + } + } + + private void handleNEWLINEtoken() { + if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token + this.hideAndAddPendingToken(this.curToken); + } else { + final Token nlToken = new CommonToken(this.curToken); // save the current NEWLINE token + final boolean isLookingAhead = this.ffgToken.getType() == PythonLexer.WS; + if (isLookingAhead) { + this.setCurrentAndFollowingTokens(); // set the next two tokens + } + + switch (this.ffgToken.getType()) { + case PythonLexer.NEWLINE: // We're before a blank line + case PythonLexer.COMMENT: // We're before a comment + this.hideAndAddPendingToken(nlToken); + if (isLookingAhead) { + this.addPendingToken(this.curToken); // WS token + } + break; + default: + this.addPendingToken(nlToken); + if (isLookingAhead) { // We're on whitespace(s) followed by a statement + final int indentationLength = this.ffgToken.getType() == Token.EOF ? 
+ 0 : + this.getIndentationLength(this.curToken.getText()); + + if (indentationLength != this.INVALID_LENGTH) { + this.addPendingToken(this.curToken); // WS token + this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) + } else { + this.reportError("inconsistent use of tabs and spaces in indentation"); + } + } else { // We're at a newline followed by a statement (there is no whitespace before the statement) + this.insertIndentOrDedentToken(0); // may insert DEDENT token(s) + } + } + } + } + + private void insertIndentOrDedentToken(final int indentLength) { + int prevIndentLength = this.indentLengthStack.peek(); + if (indentLength > prevIndentLength) { + this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken); + this.indentLengthStack.push(indentLength); + } else { + while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream + this.indentLengthStack.pop(); + prevIndentLength = this.indentLengthStack.peek(); + if (indentLength <= prevIndentLength) { + this.createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken); + } else { + this.reportError("inconsistent dedent"); + } + } + } + } + + private void insertTrailingTokens() { + switch (this.lastPendingTokenTypeFromDefaultChannel) { + case PythonLexer.NEWLINE: + case PythonLexer.DEDENT: + break; // no trailing NEWLINE token is needed + default: + // insert an extra trailing NEWLINE token that serves as the end of the last statement + this.createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken); // ffgToken is EOF + } + this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed + } + + private void handleEOFtoken() { + if (this.lastPendingTokenTypeFromDefaultChannel > 0) { + // there was statement in the input (leading NEWLINE tokens are hidden) + this.insertTrailingTokens(); + } + 
this.addPendingToken(this.curToken); + } + + private void hideAndAddPendingToken(final Token tkn) { + CommonToken ctkn = new CommonToken(tkn); + ctkn.setChannel(Token.HIDDEN_CHANNEL); + this.addPendingToken(ctkn); + } + + private void createAndAddPendingToken(final int ttype, final int channel, final String text, Token sampleToken) { + CommonToken ctkn = new CommonToken(sampleToken); + ctkn.setType(ttype); + ctkn.setChannel(channel); + ctkn.setStopIndex(sampleToken.getStartIndex() - 1); + ctkn.setText(text == null + ? "<" + this.getVocabulary().getDisplayName(ttype) + ">" + : text); + + this.addPendingToken(ctkn); + } + + private void addPendingToken(final Token tkn) { + // save the last pending token type because the pendingTokens linked list can be empty by the nextToken() + this.previousPendingTokenType = tkn.getType(); + if (tkn.getChannel() == Token.DEFAULT_CHANNEL) { + this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType; + } + this.pendingTokens.addLast(tkn); + } + + private int getIndentationLength(final String indentText) { // the indentText may contain spaces, tabs or form feeds + final int TAB_LENGTH = 8; // the standard number of spaces to replace a tab to spaces + int length = 0; + for (char ch : indentText.toCharArray()) { + switch (ch) { + case ' ': + this.wasSpaceIndentation = true; + length += 1; + break; + case '\t': + this.wasTabIndentation = true; + length += TAB_LENGTH - (length % TAB_LENGTH); + break; + case '\f': // form feed + length = 0; + break; + } + } + + if (this.wasTabIndentation && this.wasSpaceIndentation) { + if (!(this.wasIndentationMixedWithSpacesAndTabs)) { + this.wasIndentationMixedWithSpacesAndTabs = true; + length = this.INVALID_LENGTH; // only for the first inconsistent indent + } + } + return length; + } + + private void reportLexerError(final String errMsg) { + this.getErrorListenerDispatch().syntaxError(this, this.curToken, this.curToken.getLine(), this.curToken.getCharPositionInLine(), " LEXER" + 
this.ERR_TXT + errMsg, null); + } + + private void reportError(final String errMsg) { + this.reportLexerError(errMsg); + + // the ERRORTOKEN will raise an error in the parser + this.createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken); + } + } + \ No newline at end of file diff --git a/python/python2_7_18/JavaScript/PythonLexerBase.js b/python/python2_7_18/JavaScript/PythonLexerBase.js index 5b1d8687ef..82b8e59536 100644 --- a/python/python2_7_18/JavaScript/PythonLexerBase.js +++ b/python/python2_7_18/JavaScript/PythonLexerBase.js @@ -27,7 +27,7 @@ THE SOFTWARE. * */ -import { Token, CommonToken, Lexer } from "antlr4"; +import { Token, Lexer } from "antlr4"; import PythonLexer from "./PythonLexer.js"; export default class PythonLexerBase extends Lexer { @@ -49,17 +49,27 @@ export default class PythonLexerBase extends Lexer { this.wasSpaceIndentation; this.wasTabIndentation; this.wasIndentationMixedWithSpacesAndTabs; - const INVALID_LENGTH = -1; - + this.curToken; // current (under processing) token this.ffgToken; // following (look ahead) token - const ERR_TXT = " ERROR: "; + this.#init(); + } + + get #INVALID_LENGTH() { return -1; } + get #ERR_TXT() { return " ERROR: "; } - this.init(); + nextToken() { // reading the input stream until a return EOF + this.#checkNextToken(); + return this.pendingTokens.shift() /* .pollFirst() */; // add the queued token to the token stream } - init() { + reset() { + this.#init(); + super.reset(); + } + + #init() { this.indentLengthStack = []; this.pendingTokens = []; this.previousPendingTokenType = 0; @@ -72,16 +82,11 @@ export default class PythonLexerBase extends Lexer { this.ffgToken = null; } - nextToken() { // reading the input stream until a return EOF - this.checkNextToken(); - return this.pendingTokens.shift() /* .pollFirst() */; // add the queued token to the token stream - } - - checkNextToken() { + #checkNextToken() { if (this.previousPendingTokenType !== 
Token.EOF) { - this.setCurrentAndFollowingTokens(); + this.#setCurrentAndFollowingTokens(); if (this.indentLengthStack.length === 0) { // We're at the first token - this.handleStartOfInput(); + this.#handleStartOfInput(); } switch (this.curToken.type) { @@ -89,207 +94,181 @@ export default class PythonLexerBase extends Lexer { case PythonLexer.LSQB: case PythonLexer.LBRACE: this.opened++; - this.addPendingToken(this.curToken); + this.#addPendingToken(this.curToken); break; case PythonLexer.RPAR: case PythonLexer.RSQB: case PythonLexer.RBRACE: this.opened--; - this.addPendingToken(this.curToken); + this.#addPendingToken(this.curToken); break; case PythonLexer.NEWLINE: - this.handleNEWLINEtoken(); + this.#handleNEWLINEtoken(); break; - case PythonLexer.STRING: - this.handleSTRINGtoken(); - break; - case PythonLexer.ERROR_TOKEN: - this.reportLexerError(`token recognition error at: '${this.curToken.text}'`); - this.addPendingToken(this.curToken); + case PythonLexer.ERRORTOKEN: + this.#reportLexerError(`token recognition error at: '${this.curToken.text}'`); + this.#addPendingToken(this.curToken); break; case Token.EOF: - this.handleEOFtoken(); + this.#handleEOFtoken(); break; default: - this.addPendingToken(this.curToken); + this.#addPendingToken(this.curToken); } } } - setCurrentAndFollowingTokens() { + #setCurrentAndFollowingTokens() { this.curToken = this.ffgToken == undefined ? - this.getCommonTokenByToken(super.nextToken()) : - this.getCommonTokenByToken(this.ffgToken); + super.nextToken() : + this.ffgToken; this.ffgToken = this.curToken.type === Token.EOF ? 
this.curToken : - this.getCommonTokenByToken(super.nextToken()); + super.nextToken(); } // initialize the _indentLengthStack // hide the leading NEWLINE token(s) // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel // insert a leading INDENT token if necessary - handleStartOfInput() { + #handleStartOfInput() { // initialize the stack with a default 0 indentation length this.indentLengthStack.push(0); // this will never be popped off while (this.curToken.type !== Token.EOF) { if (this.curToken.channel === Token.DEFAULT_CHANNEL) { if (this.curToken.type === PythonLexer.NEWLINE) { // all the NEWLINE tokens must be ignored before the first statement - this.hideAndAddPendingToken(this.curToken); + this.#hideAndAddPendingToken(this.curToken); } else { // We're at the first statement - this.insertLeadingIndentToken(); - return; // continue the processing of the current token with checkNextToken() + this.#insertLeadingIndentToken(); + return; // continue the processing of the current token with #checkNextToken() } } else { - this.addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token + this.#addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token } - this.setCurrentAndFollowingTokens(); - } // continue the processing of the EOF token with checkNextToken() + this.#setCurrentAndFollowingTokens(); + } // continue the processing of the EOF token with #checkNextToken() } - insertLeadingIndentToken() { + #insertLeadingIndentToken() { if (this.previousPendingTokenType === PythonLexer.WS) { let prevToken = this.pendingTokens.at(- 1) /* .peekLast() */; // WS token - if (this.getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement + if (this.#getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement const errMsg = "first statement indented"; - this.reportLexerError(errMsg); + 
this.#reportLexerError(errMsg); // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser - this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.curToken); + this.#createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.#ERR_TXT + errMsg, this.curToken); } } } - handleNEWLINEtoken() { + #handleNEWLINEtoken() { if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token - this.hideAndAddPendingToken(this.curToken); + this.#hideAndAddPendingToken(this.curToken); } else { - let nlToken = this.getCommonTokenByToken(this.curToken); // save the current NEWLINE token + let nlToken = this.curToken.clone(); // save the current NEWLINE token const isLookingAhead = this.ffgToken.type === PythonLexer.WS; if (isLookingAhead) { - this.setCurrentAndFollowingTokens(); // set the next two tokens + this.#setCurrentAndFollowingTokens(); // set the next two tokens } switch (this.ffgToken.type) { - case PythonLexer.NEWLINE: // We're before a blank line - case PythonLexer.COMMENT: // We're before a comment - this.hideAndAddPendingToken(nlToken); + case PythonLexer.NEWLINE: // We're before a blank line + case PythonLexer.COMMENT: // We're before a comment + this.#hideAndAddPendingToken(nlToken); if (isLookingAhead) { - this.addPendingToken(this.curToken); // WS token + this.#addPendingToken(this.curToken); // WS token } break; default: - this.addPendingToken(nlToken); + this.#addPendingToken(nlToken); if (isLookingAhead) { // We're on whitespace(s) followed by a statement const indentationLength = this.ffgToken.type === Token.EOF ? 
- 0 : - this.getIndentationLength(this.curToken.text); + 0 : + this.#getIndentationLength(this.curToken.text); - if (indentationLength !== this.INVALID_LENGTH) { - this.addPendingToken(this.curToken); // WS token - this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) + if (indentationLength !== this.#INVALID_LENGTH) { + this.#addPendingToken(this.curToken); // WS token + this.#insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) } else { - this.reportError("inconsistent use of tabs and spaces in indentation"); + this.#reportError("inconsistent use of tabs and spaces in indentation"); } } else { // We're at a newline followed by a statement (there is no whitespace before the statement) - this.insertIndentOrDedentToken(0); // may insert DEDENT token(s) + this.#insertIndentOrDedentToken(0); // may insert DEDENT token(s) } } } } - insertIndentOrDedentToken(curIndentLength) { + #insertIndentOrDedentToken(curIndentLength) { let prevIndentLength = this.indentLengthStack.at(-1) /* peek() */; if (curIndentLength > prevIndentLength) { - this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken); + this.#createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken); this.indentLengthStack.push(curIndentLength); } else { while (curIndentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream this.indentLengthStack.pop(); prevIndentLength = this.indentLengthStack.at(-1) /* peek() */; if (curIndentLength <= prevIndentLength) { - this.createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken); + this.#createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken); } else { - this.reportError("inconsistent dedent"); + this.#reportError("inconsistent dedent"); } } } } - handleSTRINGtoken() { // remove the \ escape sequences from the string literal 
- const line_joinFreeStringLiteral = this.curToken.text.replace(/\\(\r?\n)/g, ""); - if (this.curToken.text.length === line_joinFreeStringLiteral.length) { - this.addPendingToken(this.curToken); - } else { - let originalSTRINGtoken = this.getCommonTokenByToken(this.curToken); // backup the original token - this.curToken.text = line_joinFreeStringLiteral; - this.addPendingToken(this.curToken); // add the modified token with inline string literal - this.hideAndAddPendingToken(originalSTRINGtoken); // add the original token to the hidden channel - // this inserted hidden token allows to restore the original string literal with the \ escape sequences - } - } - - insertTrailingTokens() { + #insertTrailingTokens() { switch (this.lastPendingTokenTypeFromDefaultChannel) { case PythonLexer.NEWLINE: case PythonLexer.DEDENT: break; // no trailing NEWLINE token is needed default: // insert an extra trailing NEWLINE token that serves as the end of the last statement - this.createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken); // _ffgToken is EOF + this.#createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken); // _ffgToken is EOF } - this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed + this.#insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed } - handleEOFtoken() { + #handleEOFtoken() { if (this.lastPendingTokenTypeFromDefaultChannel > 0) { // there was a statement in the input (leading NEWLINE tokens are hidden) - this.insertTrailingTokens(); + this.#insertTrailingTokens(); } - this.addPendingToken(this.curToken); + this.#addPendingToken(this.curToken); } - hideAndAddPendingToken(cToken) { - cToken.channel = Token.HIDDEN_CHANNEL; - this.addPendingToken(cToken); + #hideAndAddPendingToken(ctkn) { + ctkn.channel = Token.HIDDEN_CHANNEL; + this.#addPendingToken(ctkn); } - createAndAddPendingToken(type, channel, text, baseToken) { - const 
cToken = this.getCommonTokenByToken(baseToken); - cToken.type = type; - cToken.channel = channel; - cToken.stop = baseToken.start - 1; - cToken.text = text == null ? + #createAndAddPendingToken(type, channel, text, sampleToken) { + const ctkn = sampleToken.clone(); + ctkn.type = type; + ctkn.channel = channel; + ctkn.stop = sampleToken.start - 1; + ctkn.text = text == null ? `<${this.getSymbolicNames()[type]}>` : text; - this.addPendingToken(cToken); + this.#addPendingToken(ctkn); } - addPendingToken(token) { + #addPendingToken(tkn) { // save the last pending token type because the _pendingTokens linked list can be empty by the nextToken() - this.previousPendingTokenType = token.type; - if (token.channel === Token.DEFAULT_CHANNEL) { + this.previousPendingTokenType = tkn.type; + if (tkn.channel === Token.DEFAULT_CHANNEL) { this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType; } - this.pendingTokens.push(token) /* .addLast(token) */; - } - - getCommonTokenByToken(oldToken) { - let commonToken = new CommonToken(oldToken.source, oldToken.type, oldToken.channel, oldToken.start, oldToken.stop); - commonToken.tokenIndex = oldToken.tokenIndex; - commonToken.line = oldToken.line; - commonToken.column = oldToken.column; - commonToken.text = oldToken.text; - return commonToken; + this.pendingTokens.push(tkn) /* .addLast(token) */; } - getIndentationLength(textWS) { // the textWS may contain spaces, tabs or form feeds + #getIndentationLength(indentText) { // the indentText may contain spaces, tabs or form feeds const TAB_LENGTH = 8; // the standard number of spaces to replace a tab to spaces let length = 0; - - for (let ch of textWS) { + for (let ch of indentText) { switch (ch) { case " ": this.wasSpaceIndentation = true; @@ -308,25 +287,20 @@ export default class PythonLexerBase extends Lexer { if (this.wasTabIndentation && this.wasSpaceIndentation) { if (!this.wasIndentationMixedWithSpacesAndTabs) { this.wasIndentationMixedWithSpacesAndTabs = true; - 
return this.INVALID_LENGTH; // only for the first inconsistent indent + length = this.#INVALID_LENGTH; // only for the first inconsistent indent } } return length; } - reportLexerError(errMsg) { - this.getErrorListenerDispatch().syntaxError(this, this.curToken, this.curToken.line, this.curToken.column, " LEXER" + this.ERR_TXT + errMsg, null); + #reportLexerError(errMsg) { + this.getErrorListener().syntaxError(this, this.curToken, this.curToken.line, this.curToken.column, " LEXER" + this.#ERR_TXT + errMsg, null); } - reportError(errMsg) { - this.reportLexerError(errMsg); - - // the ERROR_TOKEN will raise an error in the parser - this.createAndAddPendingToken(PythonLexer.ERROR_TOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken); - } + #reportError(errMsg) { + this.#reportLexerError(errMsg); - reset() { - this.init(); - super.reset(); + // the ERRORTOKEN will raise an error in the parser + this.#createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.#ERR_TXT + errMsg, this.ffgToken); } } diff --git a/python/python2_7_18/Python3/PythonLexerBase.py b/python/python2_7_18/Python3/PythonLexerBase.py index dc262833cc..3abe736fdb 100644 --- a/python/python2_7_18/Python3/PythonLexerBase.py +++ b/python/python2_7_18/Python3/PythonLexerBase.py @@ -21,7 +21,6 @@ # # Developed by : Robert Einhorn -from collections import deque from typing import TextIO from antlr4 import InputStream, Lexer, Token from antlr4.Token import CommonToken @@ -33,229 +32,214 @@ def __init__(self, input: InputStream, output: TextIO = sys.stdout): super().__init__(input, output) # A stack that keeps track of the indentation lengths - self.indent_length_stack: Deque[int] + self.__indent_length_stack: list[int] # A list where tokens are waiting to be loaded into the token stream - self.pending_tokens: list[CommonToken] + self.__pending_tokens: list[CommonToken] # last pending token types - self.previous_pending_token_type: int - 
self.last_pending_token_type_from_default_channel: int + self.__previous_pending_token_type: int + self.__last_pending_token_type_from_default_channel: int # The amount of opened parentheses, square brackets or curly braces - self.opened: int - - self.was_space_indentation: bool - self.was_tab_indentation: bool - self.was_indentation_mixed_with_spaces_and_tabs: bool - self.INVALID_LENGTH: int - - self.cur_token: CommonToken # current (under processing) token - self.ffg_token: CommonToken # following (look ahead) token - - self.ERR_TXT: str - - self.init() - - def init(self): - self.indent_length_stack = deque() - self.pending_tokens = [] - self.previous_pending_token_type = 0 - self.last_pending_token_type_from_default_channel = 0 - self.opened = 0 - self.was_space_indentation = False - self.was_tab_indentation = False - self.was_indentation_mixed_with_spaces_and_tabs = False - self.INVALID_LENGTH = -1 - self.cur_token = None - self.ffg_token = None - self.ERR_TXT = " ERROR: " + self.__opened: int + + self.__was_space_indentation: bool + self.__was_tab_indentation: bool + self.__was_indentation_mixed_with_spaces_and_tabs: bool + + self.__cur_token: CommonToken # current (under processing) token + self.__ffg_token: CommonToken # following (look ahead) token + + self.__INVALID_LENGTH: int = -1 + self.__ERR_TXT: str = " ERROR: " + + self.__init() def nextToken(self) -> CommonToken: # reading the input stream until a return EOF - self.check_next_token() - return self.pending_tokens.pop(0) # add the queued token to the token stream - - def check_next_token(self): - if self.previous_pending_token_type != Token.EOF: - self.set_current_and_following_tokens() - if len(self.indent_length_stack) == 0: # We're at the first token - self.handle_start_of_input() - match self.cur_token.type: + self.__check_next_token() + return self.__pending_tokens.pop(0) # add the queued token to the token stream + + def reset(self) -> None: + self.__init() + super().reset() + + def __init(self) 
-> None: + self.__indent_length_stack = [] + self.__pending_tokens = [] + self.__previous_pending_token_type = 0 + self.__last_pending_token_type_from_default_channel = 0 + self.__opened = 0 + self.__was_space_indentation = False + self.__was_tab_indentation = False + self.__was_indentation_mixed_with_spaces_and_tabs = False + self.__cur_token = None + self.__ffg_token = None + + def __check_next_token(self) -> None: + if self.__previous_pending_token_type != Token.EOF: + self.__set_current_and_following_tokens() + if len(self.__indent_length_stack) == 0: # We're at the first token + self.__handle_start_of_input() + + match self.__cur_token.type: case self.LPAR | self.LSQB | self.LBRACE: - self.opened += 1 - self.add_pending_token(self.cur_token) + self.__opened += 1 + self.__add_pending_token(self.__cur_token) case self.RPAR | self.RSQB | self.RBRACE: - self.opened -= 1 - self.add_pending_token(self.cur_token) + self.__opened -= 1 + self.__add_pending_token(self.__cur_token) case self.NEWLINE: - self.handle_NEWLINE_token() - case self.STRING: - self.handle_STRING_token() - case self.ERROR_TOKEN: - self.report_lexer_error("token recognition error at: '" + self.cur_token.text + "'") - self.add_pending_token(self.cur_token) + self.__handle_NEWLINE_token() + case self.ERRORTOKEN: + self.__report_lexer_error("token recognition error at: '" + self.__cur_token.text + "'") + self.__add_pending_token(self.__cur_token) case Token.EOF: - self.handle_EOF_token() + self.__handle_EOF_token() case other: - self.add_pending_token(self.cur_token) + self.__add_pending_token(self.__cur_token) - def set_current_and_following_tokens(self): - self.cur_token = super().nextToken() if self.ffg_token is None else \ - self.ffg_token + def __set_current_and_following_tokens(self) -> None: + self.__cur_token = super().nextToken() if self.__ffg_token is None else \ + self.__ffg_token - self.ffg_token = self.cur_token if self.cur_token.type == Token.EOF else \ - super().nextToken() + 
self.__ffg_token = self.__cur_token if self.__cur_token.type == Token.EOF else \ + super().nextToken() # initialize the _indent_length_stack # hide the leading NEWLINE token(s) # if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel # insert a leading INDENT token if necessary - def handle_start_of_input(self): + def __handle_start_of_input(self) -> None: # initialize the stack with a default 0 indentation length - self.indent_length_stack.append(0) # this will never be popped off - while self.cur_token.type != Token.EOF: - if self.cur_token.channel == Token.DEFAULT_CHANNEL: - if self.cur_token.type == self.NEWLINE: + self.__indent_length_stack.append(0) # this will never be popped off + while self.__cur_token.type != Token.EOF: + if self.__cur_token.channel == Token.DEFAULT_CHANNEL: + if self.__cur_token.type == self.NEWLINE: # all the NEWLINE tokens must be ignored before the first statement - self.hide_and_add_pending_token(self.cur_token) + self.__hide_and_add_pending_token(self.__cur_token) else: # We're at the first statement - self.insert_leading_indent_token() - return # continue the processing of the current token with check_next_token() + self.__insert_leading_indent_token() + return # continue the processing of the current token with __check_next_token() else: - self.add_pending_token(self.cur_token) # it can be WS, EXPLICIT_LINE_JOINING or COMMENT token - self.set_current_and_following_tokens() - # continue the processing of the EOF token with check_next_token() - - def insert_leading_indent_token(self): - if self.previous_pending_token_type == self.WS: - prev_token: CommonToken = self.pending_tokens[-1] # WS token - if self.get_indentation_length(prev_token.text) != 0: # there is an "indentation" before the first statement + self.__add_pending_token(self.__cur_token) # it can be WS, EXPLICIT_LINE_JOINING or COMMENT token + self.__set_current_and_following_tokens() + # continue the processing of the EOF 
token with __check_next_token() + + def __insert_leading_indent_token(self) -> None: + if self.__previous_pending_token_type == self.WS: + prev_token: CommonToken = self.__pending_tokens[-1] # WS token + if self.__get_indentation_length(prev_token.text) != 0: # there is an "indentation" before the first statement err_msg: str = "first statement indented" - self.report_lexer_error(err_msg) + self.__report_lexer_error(err_msg) # insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser - self.create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, self.ERR_TXT + err_msg, self.cur_token) + self.__create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, self.__ERR_TXT + err_msg, self.__cur_token) - def handle_NEWLINE_token(self): - if self.opened > 0: # We're in an implicit line joining, ignore the current NEWLINE token - self.hide_and_add_pending_token(self.cur_token) + def __handle_NEWLINE_token(self) -> None: + if self.__opened > 0: # We're in an implicit line joining, ignore the current NEWLINE token + self.__hide_and_add_pending_token(self.__cur_token) else: - nl_token: CommonToken = self.cur_token # save the current NEWLINE token - is_looking_ahead: bool = self.ffg_token.type == self.WS + nl_token: CommonToken = self.__cur_token.clone() # save the current NEWLINE token + is_looking_ahead: bool = self.__ffg_token.type == self.WS if is_looking_ahead: - self.set_current_and_following_tokens() # set the next two tokens + self.__set_current_and_following_tokens() # set the next two tokens - match self.ffg_token.type: + match self.__ffg_token.type: case self.NEWLINE | self.COMMENT: - # We're before a blank line or a comment or a type comment - self.hide_and_add_pending_token(nl_token) # ignore the NEWLINE token + # We're before a blank line or a comment or type comment or a type ignore comment + self.__hide_and_add_pending_token(nl_token) # ignore the NEWLINE token if is_looking_ahead: - 
self.add_pending_token(self.cur_token) # WS token + self.__add_pending_token(self.__cur_token) # WS token case other: - self.add_pending_token(nl_token) + self.__add_pending_token(nl_token) if is_looking_ahead: # We're on a whitespace(s) followed by a statement - indentation_length: int = 0 if self.ffg_token.type == Token.EOF else \ - self.get_indentation_length(self.cur_token.text) + indentation_length: int = 0 if self.__ffg_token.type == Token.EOF else \ + self.__get_indentation_length(self.__cur_token.text) - if indentation_length != self.INVALID_LENGTH: - self.add_pending_token(self.cur_token) # WS token - self.insert_indent_or_dedent_token(indentation_length) # may insert INDENT token or DEDENT token(s) + if indentation_length != self.__INVALID_LENGTH: + self.__add_pending_token(self.__cur_token) # WS token + self.__insert_indent_or_dedent_token(indentation_length) # may insert INDENT token or DEDENT token(s) else: - self.report_error("inconsistent use of tabs and spaces in indentation") + self.__report_error("inconsistent use of tabs and spaces in indentation") else: # We're at a newline followed by a statement (there is no whitespace before the statement) - self.insert_indent_or_dedent_token(0) # may insert DEDENT token(s) + self.__insert_indent_or_dedent_token(0) # may insert DEDENT token(s) - def insert_indent_or_dedent_token(self, indent_length: int): - prev_indent_length: int = self.indent_length_stack[-1] # peek() + def __insert_indent_or_dedent_token(self, indent_length: int) -> None: + prev_indent_length: int = self.__indent_length_stack[-1] # peek() if indent_length > prev_indent_length: - self.create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, None, self.ffg_token) - self.indent_length_stack.append(indent_length) + self.__create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, None, self.__ffg_token) + self.__indent_length_stack.append(indent_length) else: while indent_length < prev_indent_length: # more than 1 DEDENT token 
may be inserted to the token stream - self.indent_length_stack.pop() - prev_indent_length = self.indent_length_stack[-1] # peek() + self.__indent_length_stack.pop() + prev_indent_length = self.__indent_length_stack[-1] # peek() if indent_length <= prev_indent_length: - self.create_and_add_pending_token(self.DEDENT, Token.DEFAULT_CHANNEL, None, self.ffg_token) + self.__create_and_add_pending_token(self.DEDENT, Token.DEFAULT_CHANNEL, None, self.__ffg_token) else: - self.report_error("inconsistent dedent") + self.__report_error("inconsistent dedent") - def handle_STRING_token(self): # remove the \ escape sequences from the string literal - # https://docs.python.org/3.11/reference/lexical_analysis.html#string-and-bytes-literals - line_joinFreeStringLiteral: str = re.sub(r"\\\r?\n", "", self.cur_token.text) - if len(self.cur_token.text) == len(line_joinFreeStringLiteral): - self.add_pending_token(self.cur_token) - else: - originalSTRINGtoken: CommonToken = self.cur_token.clone() # backup the original token - self.cur_token.text = line_joinFreeStringLiteral - self.add_pending_token(self.cur_token) # add the modified token with inline string literal - self.hide_and_add_pending_token(originalSTRINGtoken) # add the original token to the hidden channel - # this inserted hidden token allows to restore the original string literal with the \ escape sequences - - def insert_trailing_tokens(self): - match self.last_pending_token_type_from_default_channel: + def __insert_trailing_tokens(self) -> None: + match self.__last_pending_token_type_from_default_channel: case self.NEWLINE | self.DEDENT: pass # no trailing NEWLINE token is needed case other: # insert an extra trailing NEWLINE token that serves as the end of the last statement - self.create_and_add_pending_token(self.NEWLINE, Token.DEFAULT_CHANNEL, None, self.ffg_token) # _ffg_token is EOF - self.insert_indent_or_dedent_token(0) # Now insert as much trailing DEDENT tokens as needed + 
self.__create_and_add_pending_token(self.NEWLINE, Token.DEFAULT_CHANNEL, None, self.__ffg_token) # _ffg_token is EOF + self.__insert_indent_or_dedent_token(0) # Now insert as much trailing DEDENT tokens as needed - def handle_EOF_token(self): - if self.last_pending_token_type_from_default_channel > 0: + def __handle_EOF_token(self) -> None: + if self.__last_pending_token_type_from_default_channel > 0: # there was statement in the input (leading NEWLINE tokens are hidden) - self.insert_trailing_tokens() - self.add_pending_token(self.cur_token) + self.__insert_trailing_tokens() + self.__add_pending_token(self.__cur_token) - def hide_and_add_pending_token(self, cToken: CommonToken): - cToken.channel = Token.HIDDEN_CHANNEL - self.add_pending_token(cToken) + def __hide_and_add_pending_token(self, ctkn: CommonToken) -> None: + ctkn.channel = Token.HIDDEN_CHANNEL + self.__add_pending_token(ctkn) - def create_and_add_pending_token(self, type: int, channel: int, text: str, base_token: CommonToken): - cToken: CommonToken = base_token.clone() - cToken.type = type - cToken.channel = channel - cToken.stop = base_token.start - 1 - cToken.text = "<" + self.symbolicNames[type] + ">" if text is None else \ - text + def __create_and_add_pending_token(self, ttype: int, channel: int, text: str, sample_token: CommonToken) -> None: + ctkn: CommonToken = sample_token.clone() + ctkn.type = ttype + ctkn.channel = channel + ctkn.stop = sample_token.start - 1 + ctkn.text = "<" + self.symbolicNames[ttype] + ">" if text is None else \ + text - self.add_pending_token(cToken) + self.__add_pending_token(ctkn) - def add_pending_token(self, token: CommonToken): + def __add_pending_token(self, ctkn: CommonToken) -> None: # save the last pending token type because the _pending_tokens list can be empty by the nextToken() - self.previous_pending_token_type = token.type - if token.channel == Token.DEFAULT_CHANNEL: - self.last_pending_token_type_from_default_channel = self.previous_pending_token_type - 
self.pending_tokens.append(token) + self.__previous_pending_token_type = ctkn.type + if ctkn.channel == Token.DEFAULT_CHANNEL: + self.__last_pending_token_type_from_default_channel = self.__previous_pending_token_type + self.__pending_tokens.append(ctkn) - def get_indentation_length(self, textWS: str) -> int: # the textWS may contain spaces, tabs or form feeds + def __get_indentation_length(self, indentText: str) -> int: # the indentText may contain spaces, tabs or form feeds TAB_LENGTH: int = 8 # the standard number of spaces to replace a tab to spaces length: int = 0 ch: str - for ch in textWS: + for ch in indentText: match ch: case ' ': - self.was_space_indentation = True + self.__was_space_indentation = True length += 1 case '\t': - self.was_tab_indentation = True + self.__was_tab_indentation = True length += TAB_LENGTH - (length % TAB_LENGTH) case '\f': # form feed length = 0 - if self.was_tab_indentation and self.was_space_indentation: - if not self.was_indentation_mixed_with_spaces_and_tabs: - self.was_indentation_mixed_with_spaces_and_tabs = True - return self.INVALID_LENGTH # only for the first inconsistent indent + if self.__was_tab_indentation and self.__was_space_indentation: + if not self.__was_indentation_mixed_with_spaces_and_tabs: + self.__was_indentation_mixed_with_spaces_and_tabs = True + length = self.__INVALID_LENGTH # only for the first inconsistent indent return length - def report_lexer_error(self, err_msg): - self.getErrorListenerDispatch().syntaxError(self, self.cur_token, self.cur_token.line, self.cur_token.column, " LEXER" + self.ERR_TXT + err_msg, None) + def __report_lexer_error(self, err_msg: str) -> None: + self.getErrorListenerDispatch().syntaxError(self, self.__cur_token, self.__cur_token.line, self.__cur_token.column, " LEXER" + self.__ERR_TXT + err_msg, None) - def report_error(self, err_msg): - self.report_lexer_error(err_msg) + def __report_error(self, err_msg: str) -> None: + self.__report_lexer_error(err_msg) - # the 
ERROR_TOKEN will raise an error in the parser - self.create_and_add_pending_token(self.ERROR_TOKEN, Token.DEFAULT_CHANNEL, self.ERR_TXT + err_msg, self.ffg_token) - - def reset(self): - self.init() - super().reset() + # the ERRORTOKEN will raise an error in the parser + self.__create_and_add_pending_token(self.ERRORTOKEN, Token.DEFAULT_CHANNEL, self.__ERR_TXT + err_msg, self.__ffg_token) diff --git a/python/python2_7_18/PythonLexer.g4 b/python/python2_7_18/PythonLexer.g4 index 6a045a38a6..5146572012 100644 --- a/python/python2_7_18/PythonLexer.g4 +++ b/python/python2_7_18/PythonLexer.g4 @@ -131,7 +131,7 @@ NUMBER STRING : STRING_LITERAL; // https://docs.python.org/2.7/reference/lexical_analysis.html#physical-lines -NEWLINE : OS_INDEPENDENT_NL; +NEWLINE : '\r'? '\n'; // Unix, Windows // https://docs.python.org/2.7/reference/lexical_analysis.html#comments COMMENT : '#' ~[\r\n]* -> channel(HIDDEN); @@ -142,7 +142,7 @@ WS : [ \t\f]+ -> channel(HIDDEN); // https://docs.python.org/2.7/reference/lexical_analysis.html#explicit-line-joining EXPLICIT_LINE_JOINING : '\\' NEWLINE -> channel(HIDDEN); -ERROR_TOKEN : . ; // catch unrecognized characters and redirect these errors to the parser +ERRORTOKEN : . 
; // catch unrecognized characters and redirect these errors to the parser /* @@ -173,10 +173,10 @@ fragment LONG_STRING_ITEM : LONG_STRING_CHAR | ESCAPE_SEQ; fragment SHORT_STRING_CHAR_NO_SINGLE_QUOTE : ~[\\\r\n']; // fragment SHORT_STRING_CHAR_NO_DOUBLE_QUOTE : ~[\\\r\n"]; // fragment LONG_STRING_CHAR : ~'\\'; // -fragment ESCAPE_SEQ - : '\\' OS_INDEPENDENT_NL // \ escape sequence - | '\\' [\u0000-\u007F] // "\" - ; // the \ (not \n) escape sequences will be removed from the string literals by the PythonLexerBase class +fragment ESCAPE_SEQ // https://docs.python.org/2.7/reference/lexical_analysis.html#string-literals + : '\\' '\r' '\n' // for the two-character Windows line break: \ escape sequence (string literal line continuation) + | '\\' [\u0000-\u007F] // "\" + ; // https://docs.python.org/2.7/reference/lexical_analysis.html#integer-and-long-integer-literals fragment LONG_INTEGER : INTEGER ('l' | 'L'); @@ -201,9 +201,6 @@ fragment EXPONENT : ('e' | 'E') ('+' | '-')? DIGIT+; // https://docs.python.org/2.7/reference/lexical_analysis.html#imaginary-literals fragment IMAG_NUMBER : (FLOAT_NUMBER | INT_PART) ('j' | 'J'); -// https://docs.python.org/2.7/reference/lexical_analysis.html#physical-lines -fragment OS_INDEPENDENT_NL : '\r'? 
'\n'; // Unix, Windows - // https://docs.python.org/2.7/reference/lexical_analysis.html#identifiers fragment IDENTIFIER : (LETTER | '_') (LETTER | DIGIT | '_')*; fragment LETTER : LOWERCASE | UPPERCASE; diff --git a/python/python2_7_18/TypeScript/PythonLexerBase.ts b/python/python2_7_18/TypeScript/PythonLexerBase.ts new file mode 100644 index 0000000000..0c8ad608b1 --- /dev/null +++ b/python/python2_7_18/TypeScript/PythonLexerBase.ts @@ -0,0 +1,306 @@ +/* +The MIT License (MIT) +Copyright (c) 2021 Robert Einhorn + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+ */ + +/* + * + * Project : Python Indent/Dedent handler for ANTLR4 grammars + * + * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com + * + */ + +import { CharStream, Token, Lexer } from "antlr4"; +import PythonLexer from "./PythonLexer"; +import * as Collections from "typescript-collections"; + +export default abstract class PythonLexerBase extends Lexer { + // A stack that keeps track of the indentation lengths + private indentLengthStack!: Collections.Stack; + // A list where tokens are waiting to be loaded into the token stream + private pendingTokens!: Array; + + // last pending token types + private previousPendingTokenType!: number; + private lastPendingTokenTypeFromDefaultChannel!: number; + + // The amount of opened parentheses, square brackets or curly braces + private opened!: number; + + private wasSpaceIndentation!: boolean; + private wasTabIndentation!: boolean; + private wasIndentationMixedWithSpacesAndTabs!: boolean; + + private curToken: Token | undefined; // current (under processing) token + private ffgToken: Token | undefined; // following (look ahead) token + + private readonly INVALID_LENGTH: number = -1; + private readonly ERR_TXT: string = " ERROR: "; + + protected constructor(input: CharStream) { + super(input); + this.init(); + } + + public nextToken(): Token { // reading the input stream until a return EOF + this.checkNextToken(); + return this.pendingTokens.shift()! 
/* .pollFirst() */; // add the queued token to the token stream + } + + public reset(): void { + this.init(); + super.reset(); + } + + private init(): void { + this.indentLengthStack = new Collections.Stack(); + this.pendingTokens = []; + this.previousPendingTokenType = 0; + this.lastPendingTokenTypeFromDefaultChannel = 0; + this.opened = 0; + this.wasSpaceIndentation = false; + this.wasTabIndentation = false; + this.wasIndentationMixedWithSpacesAndTabs = false; + this.curToken = undefined; + this.ffgToken = undefined; + } + + private checkNextToken(): void { + if (this.previousPendingTokenType !== PythonLexer.EOF) { + this.setCurrentAndFollowingTokens(); + if (this.indentLengthStack.isEmpty()) { // We're at the first token + this.handleStartOfInput(); + } + + switch (this.curToken!.type) { + case PythonLexer.LPAR: + case PythonLexer.LSQB: + case PythonLexer.LBRACE: + this.opened++; + this.addPendingToken(this.curToken!); + break; + case PythonLexer.RPAR: + case PythonLexer.RSQB: + case PythonLexer.RBRACE: + this.opened--; + this.addPendingToken(this.curToken!); + break; + case PythonLexer.NEWLINE: + this.handleNEWLINEtoken(); + break; + case PythonLexer.ERRORTOKEN: + this.reportLexerError(`token recognition error at: '${this.curToken!.text}'`); + this.addPendingToken(this.curToken!); + break; + case PythonLexer.EOF: + this.handleEOFtoken(); + break; + default: + this.addPendingToken(this.curToken!); + } + } + } + + private setCurrentAndFollowingTokens(): void { + this.curToken = this.ffgToken == undefined + ? super.nextToken() + : this.ffgToken; + + this.ffgToken = this.curToken.type === PythonLexer.EOF + ? 
this.curToken + : super.nextToken(); + } + + // initialize the indentLengthStack + // hide the leading NEWLINE token(s) + // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel + // insert a leading INDENT token if necessary + private handleStartOfInput(): void { + // initialize the stack with a default 0 indentation length + this.indentLengthStack.push(0); // this will never be popped off + while (this.curToken!.type !== PythonLexer.EOF) { + if (this.curToken!.channel === Token.DEFAULT_CHANNEL) { + if (this.curToken!.type === PythonLexer.NEWLINE) { + // all the NEWLINE tokens must be ignored before the first statement + this.hideAndAddPendingToken(this.curToken!); + } else { // We're at the first statement + this.insertLeadingIndentToken(); + return; // continue the processing of the current token with checkNextToken() + } + } else { + this.addPendingToken(this.curToken!); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token + } + this.setCurrentAndFollowingTokens(); + } // continue the processing of the EOF token with checkNextToken() + } + + private insertLeadingIndentToken(): void { + if (this.previousPendingTokenType === PythonLexer.WS) { + const prevToken: Token = this.pendingTokens[this.pendingTokens.length - 1] /* .peekLast() */; // WS token + if (this.getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement + const errMsg: string = "first statement indented"; + this.reportLexerError(errMsg); + // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser + this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.curToken!); + } + } + } + + private handleNEWLINEtoken(): void { + if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token + this.hideAndAddPendingToken(this.curToken!); + } else { + const nlToken: Token = 
this.curToken?.clone()!; // save the current NEWLINE token + const isLookingAhead: boolean = this.ffgToken!.type === PythonLexer.WS; + if (isLookingAhead) { + this.setCurrentAndFollowingTokens(); // set the next two tokens + } + + switch (this.ffgToken!.type) { + case PythonLexer.NEWLINE: // We're before a blank line + case PythonLexer.COMMENT: // We're before a comment + this.hideAndAddPendingToken(nlToken); + if (isLookingAhead) { + this.addPendingToken(this.curToken!); // WS token + } + break; + default: + this.addPendingToken(nlToken); + if (isLookingAhead) { // We're on whitespace(s) followed by a statement + const indentationLength: number = this.ffgToken!.type === PythonLexer.EOF ? + 0 : + this.getIndentationLength(this.curToken!.text); + + if (indentationLength !== this.INVALID_LENGTH) { + this.addPendingToken(this.curToken!); // WS token + this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) + } else { + this.reportError("inconsistent use of tabs and spaces in indentation"); + } + } else { // We're at a newline followed by a statement (there is no whitespace before the statement) + this.insertIndentOrDedentToken(0); // may insert DEDENT token(s) + } + } + } + } + + private insertIndentOrDedentToken(indentLength: number): void { + let prevIndentLength: number = this.indentLengthStack.peek()!; + if (indentLength > prevIndentLength) { + this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken!); + this.indentLengthStack.push(indentLength); + } else { + while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream + this.indentLengthStack.pop(); + prevIndentLength = this.indentLengthStack.peek()!; + if (indentLength <= prevIndentLength) { + this.createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken!); + } else { + this.reportError("inconsistent dedent"); + } + } + } + } + + private insertTrailingTokens(): 
void { + switch (this.lastPendingTokenTypeFromDefaultChannel) { + case PythonLexer.NEWLINE: + case PythonLexer.DEDENT: + break; // no trailing NEWLINE token is needed + default: + // insert an extra trailing NEWLINE token that serves as the end of the last statement + this.createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken!); // ffgToken is EOF + } + this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed + } + + private handleEOFtoken(): void { + if (this.lastPendingTokenTypeFromDefaultChannel > 0) { + // there was a statement in the input (leading NEWLINE tokens are hidden) + this.insertTrailingTokens(); + } + this.addPendingToken(this.curToken!); + } + + private hideAndAddPendingToken(tkn: Token): void { + tkn.channel = Token.HIDDEN_CHANNEL; + this.addPendingToken(tkn); + } + + private createAndAddPendingToken(type: number, channel: number, text: string | null, sampleToken: Token): void { + const tkn: Token = sampleToken.clone(); + tkn.type = type; + tkn.channel = channel; + tkn.stop = sampleToken.start - 1; + tkn.text = text == null ? 
+ `<${this.getSymbolicNames()[type]}>` : + text; + + this.addPendingToken(tkn); + } + + private addPendingToken(tkn: Token): void { + // save the last pending token type because the pendingTokens linked list can be empty by the nextToken() + this.previousPendingTokenType = tkn.type; + if (tkn.channel === Token.DEFAULT_CHANNEL) { + this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType; + } + this.pendingTokens.push(tkn) /* .addLast(token) */; + } + + private getIndentationLength(indentText: string): number { // the indentText may contain spaces, tabs or form feeds + const TAB_LENGTH: number = 8; // the standard number of spaces to replace a tab to spaces + let length: number = 0; + for (let ch of indentText) { + switch (ch) { + case " ": + this.wasSpaceIndentation = true; + length += 1; + break; + case "\t": + this.wasTabIndentation = true; + length += TAB_LENGTH - (length % TAB_LENGTH); + break; + case "\f": // form feed + length = 0; + break; + } + } + + if (this.wasTabIndentation && this.wasSpaceIndentation) { + if (!this.wasIndentationMixedWithSpacesAndTabs) { + this.wasIndentationMixedWithSpacesAndTabs = true; + length = this.INVALID_LENGTH; // only for the first inconsistent indent + } + } + return length; + } + + private reportLexerError(errMsg: string): void { + this.getErrorListener().syntaxError(this, 0 /* this.curToken */, this.curToken!.line, this.curToken!.column, " LEXER" + this.ERR_TXT + errMsg, undefined); + } + + private reportError(errMsg: string): void { + this.reportLexerError(errMsg); + + // the ERRORTOKEN will raise an error in the parser + this.createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken!); + } +} diff --git a/python/python2_7_18/changes.txt b/python/python2_7_18/changes.txt new file mode 100644 index 0000000000..e945e969ee --- /dev/null +++ b/python/python2_7_18/changes.txt @@ -0,0 +1,4 @@ +Szept 05, 2024 +-------------- +Line continuation for string 
literals (backslash followed by a newline) is no longer resolved. +(backslash+newline is no longer removed from string literals) diff --git a/python/python2_7_18/desc.xml b/python/python2_7_18/desc.xml index f6ddd173f0..8aa6fdea92 100644 --- a/python/python2_7_18/desc.xml +++ b/python/python2_7_18/desc.xml @@ -1,9 +1,9 @@ - ^4.13.1 - CSharp;Java;Python3;JavaScript + ^4.13.2 + CSharp;Java;Python3;JavaScript;TypeScript - CSharp;Java;Python3;JavaScript + CSharp;Java;Python3;JavaScript;TypeScript file_input examples diff --git a/python/python2_7_18/tests/test_error_first_statement_indented.py b/python/python2_7_18/tests/test_error_first_statement_indented.py index dc70cc8572..39431ac786 100644 --- a/python/python2_7_18/tests/test_error_first_statement_indented.py +++ b/python/python2_7_18/tests/test_error_first_statement_indented.py @@ -4,7 +4,7 @@ # EXPECTATIONS: # - inserted leading INDENT token # - hidden NEWLINE tokens (channel=1) before the first statement -# - lexer error message: "line 10:3 first statement indented" +# - lexer error message: "line 10:3 LEXER ERROR: first statement indented" i = 1 # first statement begins with space diff --git a/python/python2_7_18/tests/test_error_inconsistent_dedent.py b/python/python2_7_18/tests/test_error_inconsistent_dedent.py index 0a74fde76a..660f59ff65 100644 --- a/python/python2_7_18/tests/test_error_inconsistent_dedent.py +++ b/python/python2_7_18/tests/test_error_inconsistent_dedent.py @@ -3,7 +3,7 @@ # # EXPECTATIONS: # - inserted ERROR_TOKEN instead of the DEDENT token -# - lexer error message: "line 10:0 inconsistent dedent" +# - lexer error message: "line 10:0 LEXER ERROR: inconsistent dedent" if True: i = 0 diff --git a/python/python2_7_18/tests/test_error_tab_and_space_in_indentation.py b/python/python2_7_18/tests/test_error_tab_and_space_in_indentation.py index 493933be68..7d77a9bc0e 100644 --- a/python/python2_7_18/tests/test_error_tab_and_space_in_indentation.py +++ 
b/python/python2_7_18/tests/test_error_tab_and_space_in_indentation.py @@ -3,7 +3,7 @@ # # EXPECTATIONS: # - inserted ERROR_TOKEN instead of the WS token -# - lexer error message: "line 11:0 inconsistent use of tabs and spaces in indentation" +# - lexer error message: "line 11:0 LEXER ERROR: inconsistent use of tabs and spaces in indentation" if True: i = 0 # indented by spaces diff --git a/python/python2_7_18/tests/test_error_unexpected_indent.py b/python/python2_7_18/tests/test_error_unexpected_indent.py index 9d6bbd3f1f..9fca02bf5d 100644 --- a/python/python2_7_18/tests/test_error_unexpected_indent.py +++ b/python/python2_7_18/tests/test_error_unexpected_indent.py @@ -2,7 +2,7 @@ # grun Python file_input -tokens test_error_unexpected_indent.py # # EXPECTATION: -# - parser error message: "line 9:7 extraneous input '' ..." +# - parser error message: "line 9:7 mismatched input '' ..." if True: i = 0 diff --git a/python/python2_7_18/tests/test_explicit_line_joining.py b/python/python2_7_18/tests/test_explicit_line_joining.py index 011ee61e4b..55be1bd964 100644 --- a/python/python2_7_18/tests/test_explicit_line_joining.py +++ b/python/python2_7_18/tests/test_explicit_line_joining.py @@ -2,7 +2,7 @@ # grun Python file_input -tokens test_explicit_line_joining.py # # EXPECTATIONS: -# - hiden (channel=1) LINE_JOINING token +# - hidden (channel=1) EXPLICIT_LINE_JOINING token # - no error message i = 1 \ diff --git a/python/python2_7_18/tests/test_hidden_NEWLINE_before_comment.py b/python/python2_7_18/tests/test_hidden_NEWLINE_before_comment.py index d080bc16fb..9db3798954 100644 --- a/python/python2_7_18/tests/test_hidden_NEWLINE_before_comment.py +++ b/python/python2_7_18/tests/test_hidden_NEWLINE_before_comment.py @@ -6,6 +6,6 @@ def inc(value): # grun Python file_input -tokens test_hidden_NEWLINE_before_comment.py # # EXPECTATIONS: -# - hidden NEWLINE tokens (channel=1) before a COMMENT (or a TYPE_COMMENT) token +# - hidden NEWLINE tokens (channel=1) before a COMMENT
token # - hidden NEWLINE token (channel=1) before the blank line # - no error message diff --git a/python/python2_7_18/tests/test_string_literal_with_newline_escape_sequence.py b/python/python2_7_18/tests/test_string_literal_with_newline_escape_sequence.py deleted file mode 100644 index f14d73cb74..0000000000 --- a/python/python2_7_18/tests/test_string_literal_with_newline_escape_sequence.py +++ /dev/null @@ -1,10 +0,0 @@ -# COMMAND LINE: -# grun Python file_input -tokens test_string_literal_with_newline_escape_sequence.py -# -# EXPECTATIONS: -# - removed \ escape sequence from the STRING token -# - inserted hidden token (channel=1) with the original string literal -# - no error message - -s = 'This string will not include \ -backslashes or newline characters.' diff --git a/python/python3_12_1/CSharp/PythonLexerBase.cs b/python/python3_12_1/CSharp/PythonLexerBase.cs index aedd7346ed..f67f3a1c62 100644 --- a/python/python3_12_1/CSharp/PythonLexerBase.cs +++ b/python/python3_12_1/CSharp/PythonLexerBase.cs @@ -37,6 +37,7 @@ public abstract class PythonLexerBase : Lexer private Stack indentLengthStack; // A list where tokens are waiting to be loaded into the token stream private LinkedList pendingTokens; + // last pending token types private int previousPendingTokenType; private int lastPendingTokenTypeFromDefaultChannel; @@ -49,11 +50,11 @@ public abstract class PythonLexerBase : Lexer private bool wasSpaceIndentation; private bool wasTabIndentation; private bool wasIndentationMixedWithSpacesAndTabs; - private const int INVALID_LENGTH = -1; - private CommonToken curToken; // current (under processing) token - private IToken ffgToken; // following (look ahead) token + private IToken curToken; // current (under processing) token + private IToken ffgToken; // following (look ahead) token + private const int INVALID_LENGTH = -1; private const string ERR_TXT = " ERROR: "; protected PythonLexerBase(ICharStream input) : base(input) @@ -66,6 +67,20 @@ protected 
PythonLexerBase(ICharStream input, TextWriter output, TextWriter error this.Init(); } + public override IToken NextToken() // reading the input stream until a return EOF + { + this.CheckNextToken(); + IToken firstPendingToken = this.pendingTokens.First.Value; + this.pendingTokens.RemoveFirst(); + return firstPendingToken; // add the queued token to the token stream + } + + public override void Reset() + { + this.Init(); + base.Reset(); + } + private void Init() { this.indentLengthStack = new Stack(); @@ -81,14 +96,6 @@ private void Init() this.ffgToken = null!; } - public override IToken NextToken() // reading the input stream until a return EOF - { - this.CheckNextToken(); - IToken firstPendingToken = this.pendingTokens.First.Value; - this.pendingTokens.RemoveFirst(); - return firstPendingToken; // add the queued token to the token stream - } - private void CheckNextToken() { if (this.previousPendingTokenType != TokenConstants.EOF) @@ -116,13 +123,10 @@ private void CheckNextToken() case PythonLexer.NEWLINE: this.HandleNEWLINEtoken(); break; - case PythonLexer.STRING: - this.HandleSTRINGtoken(); - break; case PythonLexer.FSTRING_MIDDLE: this.HandleFSTRING_MIDDLE_token(); break; - case PythonLexer.ERROR_TOKEN: + case PythonLexer.ERRORTOKEN: this.ReportLexerError("token recognition error at: '" + this.curToken.Text + "'"); this.AddPendingToken(this.curToken); break; @@ -140,14 +144,14 @@ private void CheckNextToken() private void SetCurrentAndFollowingTokens() { this.curToken = this.ffgToken == null ? - new CommonToken(base.NextToken()) : - new CommonToken(this.ffgToken); + base.NextToken() : + this.ffgToken; this.HandleFStringLexerModes(); this.ffgToken = this.curToken.Type == TokenConstants.EOF ? 
- this.curToken : - base.NextToken(); + this.curToken : + base.NextToken(); } // initialize the _indentLengths @@ -205,7 +209,7 @@ private void HandleNEWLINEtoken() } else { - CommonToken nlToken = new CommonToken(this.curToken); // save the current NEWLINE token + IToken nlToken = new CommonToken(this.curToken); // save the current NEWLINE token bool isLookingAhead = this.ffgToken.Type == PythonLexer.WS; if (isLookingAhead) { @@ -214,13 +218,12 @@ private void HandleNEWLINEtoken() switch (this.ffgToken.Type) { - case PythonLexer.NEWLINE: // We're before a blank line - case PythonLexer.COMMENT: // We're before a comment - case PythonLexer.TYPE_COMMENT: // We're before a type comment + case PythonLexer.NEWLINE: // We're before a blank line + case PythonLexer.COMMENT: // We're before a comment this.HideAndAddPendingToken(nlToken); if (isLookingAhead) { - this.AddPendingToken(this.curToken); // WS token + this.AddPendingToken(this.curToken); // WS token } break; default: @@ -253,7 +256,6 @@ private void HandleNEWLINEtoken() private void InsertIndentOrDedentToken(int indentLength) { - //*** https://docs.python.org/3/reference/lexical_analysis.html#indentation int prevIndentLength = this.indentLengthStack.Peek(); if (indentLength > prevIndentLength) { @@ -278,25 +280,6 @@ private void InsertIndentOrDedentToken(int indentLength) } } - private void HandleSTRINGtoken() - { - // remove the \ escape sequences from the string literal - // https://docs.python.org/3.11/reference/lexical_analysis.html#string-and-bytes-literals - string line_joinFreeStringLiteral = Regex.Replace(this.curToken.Text, @"\\\r?\n", ""); - if (this.curToken.Text.Length == line_joinFreeStringLiteral.Length) - { - this.AddPendingToken(this.curToken); - } - else - { - CommonToken originalSTRINGtoken = new CommonToken(this.curToken); // backup the original token - this.curToken.Text = line_joinFreeStringLiteral; - this.AddPendingToken(this.curToken); // add the modified token with inline string literal - 
this.HideAndAddPendingToken(originalSTRINGtoken); // add the original token with a hidden channel - // this inserted hidden token allows to restore the original string literal with the \ escape sequences - } - } - private void HandleFSTRING_MIDDLE_token() // replace the double braces '{{' or '}}' to single braces and hide the second braces { string fsMid = this.curToken.Text; @@ -325,7 +308,7 @@ private void HandleFStringLexerModes() // https://peps.python.org/pep-0498/#spe switch (this.curToken.Type) { case PythonLexer.LBRACE: - this.PushMode(PythonLexer.DEFAULT_MODE); + this.PushMode(Lexer.DEFAULT_MODE); this.paren_or_bracket_openedStack.Push(0); break; case PythonLexer.LPAR: @@ -358,7 +341,7 @@ private void HandleFStringLexerModes() // https://peps.python.org/pep-0498/#spe case PythonLexer.RBRACE: switch (CurrentMode) { - case PythonLexer.DEFAULT_MODE: + case Lexer.DEFAULT_MODE: case PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE: case PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE: this.PopMode(); @@ -412,42 +395,43 @@ private void HandleEOFtoken() this.AddPendingToken(this.curToken); } - private void HideAndAddPendingToken(CommonToken cToken) + private void HideAndAddPendingToken(IToken tkn) { - cToken.Channel = TokenConstants.HiddenChannel; - this.AddPendingToken(cToken); + CommonToken ctkn = new CommonToken(tkn); + ctkn.Channel = TokenConstants.HiddenChannel; + this.AddPendingToken(ctkn); } - private void CreateAndAddPendingToken(int type, int channel, string text, IToken baseToken) + private void CreateAndAddPendingToken(int ttype, int channel, string text, IToken sampleToken) { - CommonToken cToken = new CommonToken(baseToken); - cToken.Type = type; - cToken.Channel = channel; - cToken.StopIndex = baseToken.StartIndex - 1; + CommonToken ctkn = new CommonToken(sampleToken); + ctkn.Type = ttype; + ctkn.Channel = channel; + ctkn.StopIndex = sampleToken.StartIndex - 1; - cToken.Text = text == null - ? 
"<" + Vocabulary.GetSymbolicName(type) + ">" + ctkn.Text = text == null + ? "<" + Vocabulary.GetSymbolicName(ttype) + ">" : text; - this.AddPendingToken(cToken); + this.AddPendingToken(ctkn); } - private void AddPendingToken(IToken token) + private void AddPendingToken(IToken tkn) { // save the last pending token type because the pendingTokens linked list can be empty by the nextToken() - this.previousPendingTokenType = token.Type; - if (token.Channel == TokenConstants.DefaultChannel) + this.previousPendingTokenType = tkn.Type; + if (tkn.Channel == TokenConstants.DefaultChannel) { this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType; } - this.pendingTokens.AddLast(token); + this.pendingTokens.AddLast(tkn); } - private int GetIndentationLength(string textWS) // the textWS may contain spaces, tabs or form feeds + private int GetIndentationLength(string indentText) // the indentText may contain spaces, tabs or form feeds { const int TAB_LENGTH = 8; // the standard number of spaces to replace a tab with spaces int length = 0; - foreach (char ch in textWS) + foreach (char ch in indentText) { switch (ch) { @@ -470,7 +454,7 @@ private int GetIndentationLength(string textWS) // the textWS may contain spaces if (!this.wasIndentationMixedWithSpacesAndTabs) { this.wasIndentationMixedWithSpacesAndTabs = true; - return PythonLexerBase.INVALID_LENGTH; // only for the first inconsistent indent + length = PythonLexerBase.INVALID_LENGTH; // only for the first inconsistent indent } } return length; @@ -485,13 +469,7 @@ private void ReportError(string errMsg) { this.ReportLexerError(errMsg); - // the ERROR_TOKEN will raise an error in the parser - this.CreateAndAddPendingToken(PythonLexer.ERROR_TOKEN, TokenConstants.DefaultChannel, PythonLexerBase.ERR_TXT + errMsg, this.ffgToken); - } - - public override void Reset() - { - this.Init(); - base.Reset(); + // the ERRORTOKEN will raise an error in the parser + this.CreateAndAddPendingToken(PythonLexer.ERRORTOKEN, 
TokenConstants.DefaultChannel, PythonLexerBase.ERR_TXT + errMsg, this.ffgToken); } } diff --git a/python/python3_12_1/Java/PythonLexerBase.java b/python/python3_12_1/Java/PythonLexerBase.java index 2f9490617e..7e4f059d7d 100644 --- a/python/python3_12_1/Java/PythonLexerBase.java +++ b/python/python3_12_1/Java/PythonLexerBase.java @@ -49,11 +49,11 @@ public abstract class PythonLexerBase extends Lexer { private boolean wasSpaceIndentation; private boolean wasTabIndentation; private boolean wasIndentationMixedWithSpacesAndTabs; - private final int INVALID_LENGTH = -1; - private CommonToken curToken; // current (under processing) token + private Token curToken; // current (under processing) token private Token ffgToken; // following (look ahead) token + private final int INVALID_LENGTH = -1; private final String ERR_TXT = " ERROR: "; protected PythonLexerBase(CharStream input) { @@ -61,6 +61,18 @@ protected PythonLexerBase(CharStream input) { this.init(); } + @Override + public Token nextToken() { // reading the input stream until a return EOF + this.checkNextToken(); + return this.pendingTokens.pollFirst(); // add the queued token to the token stream + } + + @Override + public void reset() { + this.init(); + super.reset(); + } + private void init() { this.indentLengthStack = new ArrayDeque<>(); this.pendingTokens = new LinkedList<>(); @@ -75,12 +87,6 @@ private void init() { this.ffgToken = null; } - @Override - public Token nextToken() { // reading the input stream until a return EOF - this.checkNextToken(); - return this.pendingTokens.pollFirst(); // add the queued token to the token stream - } - private void checkNextToken() { if (this.previousPendingTokenType != Token.EOF) { this.setCurrentAndFollowingTokens(); @@ -104,13 +110,10 @@ private void checkNextToken() { case PythonLexer.NEWLINE: this.handleNEWLINEtoken(); break; - case PythonLexer.STRING: - this.handleSTRINGtoken(); - break; case PythonLexer.FSTRING_MIDDLE: this.handleFSTRING_MIDDLE_token(); break; - 
case PythonLexer.ERROR_TOKEN: + case PythonLexer.ERRORTOKEN: this.reportLexerError("token recognition error at: '" + this.curToken.getText() + "'"); this.addPendingToken(this.curToken); break; @@ -126,8 +129,8 @@ private void checkNextToken() { private void setCurrentAndFollowingTokens() { this.curToken = this.ffgToken == null ? - new CommonToken(super.nextToken()) : - new CommonToken(this.ffgToken); + super.nextToken() : + this.ffgToken; this.handleFStringLexerModes(); @@ -156,7 +159,8 @@ private void handleStartOfInput() { this.addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token } this.setCurrentAndFollowingTokens(); - } // continue the processing of the EOF token with checkNextToken() + } + // continue the processing of the EOF token with checkNextToken() } private void insertLeadingIndentToken() { @@ -175,19 +179,18 @@ private void handleNEWLINEtoken() { if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token this.hideAndAddPendingToken(this.curToken); } else { - CommonToken nlToken = new CommonToken(this.curToken); // save the current NEWLINE token + final Token nlToken = new CommonToken(this.curToken); // save the current NEWLINE token final boolean isLookingAhead = this.ffgToken.getType() == PythonLexer.WS; if (isLookingAhead) { this.setCurrentAndFollowingTokens(); // set the next two tokens } switch (this.ffgToken.getType()) { - case PythonLexer.NEWLINE: // We're before a blank line - case PythonLexer.COMMENT: // We're before a comment - case PythonLexer.TYPE_COMMENT: // We're before a type comment + case PythonLexer.NEWLINE: // We're before a blank line + case PythonLexer.COMMENT: // We're before a comment this.hideAndAddPendingToken(nlToken); if (isLookingAhead) { - this.addPendingToken(this.curToken); // WS token + this.addPendingToken(this.curToken); // WS token } break; default: @@ -228,19 +231,6 @@ private void insertIndentOrDedentToken(final int indentLength) { } } - private 
void handleSTRINGtoken() { // remove the \ escape sequences from the string literal - final String line_joinFreeStringLiteral = this.curToken.getText().replaceAll("\\\\\\r?\\n", ""); - if (this.curToken.getText().length() == line_joinFreeStringLiteral.length()) { - this.addPendingToken(this.curToken); - } else { - CommonToken originalSTRINGtoken = new CommonToken(this.curToken); // backup the original token - this.curToken.setText(line_joinFreeStringLiteral); - this.addPendingToken(this.curToken); // add the modified token with inline string literal - this.hideAndAddPendingToken(originalSTRINGtoken); // add the original token to the hidden channel - // this inserted hidden token allows to restore the original string literal with the \ escape sequences - } - } - private void handleFSTRING_MIDDLE_token() { // replace the double braces '{{' or '}}' to single braces and hide the second braces String fsMid = this.curToken.getText(); fsMid = fsMid.replaceAll("\\{\\{", "{_").replaceAll("}}", "}_"); // replace: {{ --> {_ and }} --> }_ @@ -248,7 +238,7 @@ private void handleFSTRING_MIDDLE_token() { // replace the double braces '{{' or for (String s : arrOfStr) { if (!s.isEmpty()) { this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, s, this.ffgToken); - String lastCharacter = s.substring(s.length() - 1); + final String lastCharacter = s.substring(s.length() - 1); if ("{}".contains(lastCharacter)) { this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.HIDDEN_CHANNEL, lastCharacter, this.ffgToken); // this inserted hidden token allows to restore the original f-string literal with the double braces @@ -261,7 +251,7 @@ private void handleFStringLexerModes() { // https://peps.python.org/pep-0498/#sp if (!this._modeStack.isEmpty()) { switch (this.curToken.getType()) { case PythonLexer.LBRACE: - this.pushMode(PythonLexer.DEFAULT_MODE); + this.pushMode(Lexer.DEFAULT_MODE); this.paren_or_bracket_openedStack.push(0); break; case 
PythonLexer.LPAR: @@ -291,7 +281,7 @@ private void handleFStringLexerModes() { // https://peps.python.org/pep-0498/#sp break; case PythonLexer.RBRACE: switch (this._mode) { - case PythonLexer.DEFAULT_MODE: + case Lexer.DEFAULT_MODE: case PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE: case PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE: this.popMode(); @@ -339,36 +329,37 @@ private void handleEOFtoken() { this.addPendingToken(this.curToken); } - private void hideAndAddPendingToken(CommonToken cToken) { - cToken.setChannel(Token.HIDDEN_CHANNEL); - this.addPendingToken(cToken); + private void hideAndAddPendingToken(final Token tkn) { + CommonToken ctkn = new CommonToken(tkn); + ctkn.setChannel(Token.HIDDEN_CHANNEL); + this.addPendingToken(ctkn); } - private void createAndAddPendingToken(final int type, final int channel, final String text, Token baseToken) { - CommonToken cToken = new CommonToken(baseToken); - cToken.setType(type); - cToken.setChannel(channel); - cToken.setStopIndex(baseToken.getStartIndex() - 1); - cToken.setText(text == null - ? "<" + this.getVocabulary().getSymbolicName(type) + ">" - : text); + private void createAndAddPendingToken(final int ttype, final int channel, final String text, Token sampleToken) { + CommonToken ctkn = new CommonToken(sampleToken); + ctkn.setType(ttype); + ctkn.setChannel(channel); + ctkn.setStopIndex(sampleToken.getStartIndex() - 1); + ctkn.setText(text == null + ? 
"<" + this.getVocabulary().getDisplayName(ttype) + ">" + : text); - this.addPendingToken(cToken); + this.addPendingToken(ctkn); } - private void addPendingToken(final Token token) { + private void addPendingToken(final Token tkn) { // save the last pending token type because the pendingTokens linked list can be empty by the nextToken() - this.previousPendingTokenType = token.getType(); - if (token.getChannel() == Token.DEFAULT_CHANNEL) { + this.previousPendingTokenType = tkn.getType(); + if (tkn.getChannel() == Token.DEFAULT_CHANNEL) { this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType; } - this.pendingTokens.addLast(token); + this.pendingTokens.addLast(tkn); } - private int getIndentationLength(final String textWS) { // the textWS may contain spaces, tabs or form feeds + private int getIndentationLength(final String indentText) { // the indentText may contain spaces, tabs or form feeds final int TAB_LENGTH = 8; // the standard number of spaces to replace a tab to spaces int length = 0; - for (char ch : textWS.toCharArray()) { + for (char ch : indentText.toCharArray()) { switch (ch) { case ' ': this.wasSpaceIndentation = true; @@ -387,7 +378,7 @@ private int getIndentationLength(final String textWS) { // the textWS may contai if (this.wasTabIndentation && this.wasSpaceIndentation) { if (!(this.wasIndentationMixedWithSpacesAndTabs)) { this.wasIndentationMixedWithSpacesAndTabs = true; - return this.INVALID_LENGTH; // only for the first inconsistent indent + length = this.INVALID_LENGTH; // only for the first inconsistent indent } } return length; @@ -400,13 +391,7 @@ private void reportLexerError(final String errMsg) { private void reportError(final String errMsg) { this.reportLexerError(errMsg); - // the ERROR_TOKEN will raise an error in the parser - this.createAndAddPendingToken(PythonLexer.ERROR_TOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken); - } - - @Override - public void reset() { - this.init(); - super.reset(); + 
// the ERRORTOKEN will raise an error in the parser + this.createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken); } } diff --git a/python/python3_12_1/JavaScript/PythonLexerBase.js b/python/python3_12_1/JavaScript/PythonLexerBase.js index 2a7bebaf95..2c1ea71d47 100644 --- a/python/python3_12_1/JavaScript/PythonLexerBase.js +++ b/python/python3_12_1/JavaScript/PythonLexerBase.js @@ -27,7 +27,7 @@ THE SOFTWARE. * */ -import { Token, CommonToken, Lexer } from "antlr4"; +import { Token, Lexer } from "antlr4"; import PythonLexer from "./PythonLexer.js"; export default class PythonLexerBase extends Lexer { @@ -51,17 +51,27 @@ export default class PythonLexerBase extends Lexer { this.wasSpaceIndentation; this.wasTabIndentation; this.wasIndentationMixedWithSpacesAndTabs; - const INVALID_LENGTH = -1; - + this.curToken; // current (under processing) token this.ffgToken; // following (look ahead) token - const ERR_TXT = " ERROR: "; + this.#init(); + } + + get #INVALID_LENGTH() { return -1; } + get #ERR_TXT() { return " ERROR: "; } - this.init(); + nextToken() { // reading the input stream until a return EOF + this.#checkNextToken(); + return this.pendingTokens.shift() /* .pollFirst() */; // add the queued token to the token stream } - init() { + reset() { + this.#init(); + super.reset(); + } + + #init() { this.indentLengthStack = []; this.pendingTokens = []; this.previousPendingTokenType = 0; @@ -75,16 +85,11 @@ export default class PythonLexerBase extends Lexer { this.ffgToken = null; } - nextToken() { // reading the input stream until a return EOF - this.checkNextToken(); - return this.pendingTokens.shift() /* .pollFirst() */; // add the queued token to the token stream - } - - checkNextToken() { + #checkNextToken() { if (this.previousPendingTokenType !== Token.EOF) { - this.setCurrentAndFollowingTokens(); + this.#setCurrentAndFollowingTokens(); if (this.indentLengthStack.length === 0) { // We're at the first token - 
this.handleStartOfInput(); + this.#handleStartOfInput(); } switch (this.curToken.type) { @@ -92,175 +97,158 @@ export default class PythonLexerBase extends Lexer { case PythonLexer.LSQB: case PythonLexer.LBRACE: this.opened++; - this.addPendingToken(this.curToken); + this.#addPendingToken(this.curToken); break; case PythonLexer.RPAR: case PythonLexer.RSQB: case PythonLexer.RBRACE: this.opened--; - this.addPendingToken(this.curToken); + this.#addPendingToken(this.curToken); break; case PythonLexer.NEWLINE: - this.handleNEWLINEtoken(); - break; - case PythonLexer.STRING: - this.handleSTRINGtoken(); + this.#handleNEWLINEtoken(); break; case PythonLexer.FSTRING_MIDDLE: - this.handleFSTRING_MIDDLE_token(); + this.#handleFSTRING_MIDDLE_token(); break; - case PythonLexer.ERROR_TOKEN: - this.reportLexerError(`token recognition error at: '${this.curToken.text}'`); - this.addPendingToken(this.curToken); + case PythonLexer.ERRORTOKEN: + this.#reportLexerError(`token recognition error at: '${this.curToken.text}'`); + this.#addPendingToken(this.curToken); break; case Token.EOF: - this.handleEOFtoken(); + this.#handleEOFtoken(); break; default: - this.addPendingToken(this.curToken); + this.#addPendingToken(this.curToken); } - this.handleFORMAT_SPECIFICATION_MODE(); + this.#handleFORMAT_SPECIFICATION_MODE(); } } - setCurrentAndFollowingTokens() { + #setCurrentAndFollowingTokens() { this.curToken = this.ffgToken == undefined ? - this.getCommonTokenByToken(super.nextToken()) : - this.getCommonTokenByToken(this.ffgToken); + super.nextToken() : + this.ffgToken; - this.handleFStringLexerModes(); + this.#handleFStringLexerModes(); this.ffgToken = this.curToken.type === Token.EOF ? 
this.curToken : - this.getCommonTokenByToken(super.nextToken()); + super.nextToken(); } // initialize the _indentLengthStack // hide the leading NEWLINE token(s) // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel // insert a leading INDENT token if necessary - handleStartOfInput() { + #handleStartOfInput() { // initialize the stack with a default 0 indentation length this.indentLengthStack.push(0); // this will never be popped off while (this.curToken.type !== Token.EOF) { if (this.curToken.channel === Token.DEFAULT_CHANNEL) { if (this.curToken.type === PythonLexer.NEWLINE) { // all the NEWLINE tokens must be ignored before the first statement - this.hideAndAddPendingToken(this.curToken); + this.#hideAndAddPendingToken(this.curToken); } else { // We're at the first statement - this.insertLeadingIndentToken(); - return; // continue the processing of the current token with checkNextToken() + this.#insertLeadingIndentToken(); + return; // continue the processing of the current token with #checkNextToken() } } else { - this.addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token + this.#addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token } - this.setCurrentAndFollowingTokens(); - } // continue the processing of the EOF token with checkNextToken() + this.#setCurrentAndFollowingTokens(); + } // continue the processing of the EOF token with #checkNextToken() } - insertLeadingIndentToken() { + #insertLeadingIndentToken() { if (this.previousPendingTokenType === PythonLexer.WS) { let prevToken = this.pendingTokens.at(- 1) /* .peekLast() */; // WS token - if (this.getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement + if (this.#getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement const errMsg = "first statement indented"; - this.reportLexerError(errMsg); + 
this.#reportLexerError(errMsg); // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser - this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.curToken); + this.#createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.#ERR_TXT + errMsg, this.curToken); } } } - handleNEWLINEtoken() { + #handleNEWLINEtoken() { if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token - this.hideAndAddPendingToken(this.curToken); + this.#hideAndAddPendingToken(this.curToken); } else { - let nlToken = this.getCommonTokenByToken(this.curToken); // save the current NEWLINE token + let nlToken = this.curToken.clone(); // save the current NEWLINE token const isLookingAhead = this.ffgToken.type === PythonLexer.WS; if (isLookingAhead) { - this.setCurrentAndFollowingTokens(); // set the next two tokens + this.#setCurrentAndFollowingTokens(); // set the next two tokens } switch (this.ffgToken.type) { - case PythonLexer.NEWLINE: // We're before a blank line - case PythonLexer.COMMENT: // We're before a comment - case PythonLexer.TYPE_COMMENT: // We're before a type comment - this.hideAndAddPendingToken(nlToken); + case PythonLexer.NEWLINE: // We're before a blank line + case PythonLexer.COMMENT: // We're before a comment + this.#hideAndAddPendingToken(nlToken); if (isLookingAhead) { - this.addPendingToken(this.curToken); // WS token + this.#addPendingToken(this.curToken); // WS token } break; default: - this.addPendingToken(nlToken); + this.#addPendingToken(nlToken); if (isLookingAhead) { // We're on whitespace(s) followed by a statement const indentationLength = this.ffgToken.type === Token.EOF ? 
- 0 : - this.getIndentationLength(this.curToken.text); + 0 : + this.#getIndentationLength(this.curToken.text); - if (indentationLength !== this.INVALID_LENGTH) { - this.addPendingToken(this.curToken); // WS token - this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) + if (indentationLength !== this.#INVALID_LENGTH) { + this.#addPendingToken(this.curToken); // WS token + this.#insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) } else { - this.reportError("inconsistent use of tabs and spaces in indentation"); + this.#reportError("inconsistent use of tabs and spaces in indentation"); } } else { // We're at a newline followed by a statement (there is no whitespace before the statement) - this.insertIndentOrDedentToken(0); // may insert DEDENT token(s) + this.#insertIndentOrDedentToken(0); // may insert DEDENT token(s) } } } } - insertIndentOrDedentToken(curIndentLength) { + #insertIndentOrDedentToken(curIndentLength) { let prevIndentLength = this.indentLengthStack.at(-1) /* peek() */; if (curIndentLength > prevIndentLength) { - this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken); + this.#createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken); this.indentLengthStack.push(curIndentLength); } else { while (curIndentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream this.indentLengthStack.pop(); prevIndentLength = this.indentLengthStack.at(-1) /* peek() */; if (curIndentLength <= prevIndentLength) { - this.createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken); + this.#createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken); } else { - this.reportError("inconsistent dedent"); + this.#reportError("inconsistent dedent"); } } } } - handleSTRINGtoken() { // remove the \ escape sequences from the string literal 
- const line_joinFreeStringLiteral = this.curToken.text.replace(/\\(\r?\n)/g, ""); - if (this.curToken.text.length === line_joinFreeStringLiteral.length) { - this.addPendingToken(this.curToken); - } else { - let originalSTRINGtoken = this.getCommonTokenByToken(this.curToken); // backup the original token - this.curToken.text = line_joinFreeStringLiteral; - this.addPendingToken(this.curToken); // add the modified token with inline string literal - this.hideAndAddPendingToken(originalSTRINGtoken); // add the original token to the hidden channel - // this inserted hidden token allows to restore the original string literal with the \ escape sequences - } - } - - handleFSTRING_MIDDLE_token() { // replace the double braces '{{' or '}}' to single braces and hide the second braces + #handleFSTRING_MIDDLE_token() { // replace the double braces '{{' or '}}' to single braces and hide the second braces let fsMid = this.curToken.text; fsMid = fsMid.replaceAll(/\{\{/g, "{_").replaceAll(/\}\}/g, "}_"); // replace: {{ --> {_ and }} --> }_ let arrOfStr = fsMid.split(/(?<=[{}])_/); // split by {_ or }_ for (let s of arrOfStr) { if (s) { - this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, s, this.ffgToken); + this.#createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, s, this.ffgToken); let lastCharacter = s.charAt(s.length - 1); if ("{}".includes(lastCharacter)) { - this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.HIDDEN_CHANNEL, lastCharacter, this.ffgToken); + this.#createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.HIDDEN_CHANNEL, lastCharacter, this.ffgToken); // this inserted hidden token allows to restore the original f-string literal with the double braces } } } } - handleFStringLexerModes() { // https://peps.python.org/pep-0498/#specification + #handleFStringLexerModes() { // https://peps.python.org/pep-0498/#specification if (this._modeStack.length > 0) { switch (this.curToken.type) { case 
PythonLexer.LBRACE: - this.pushMode(PythonLexer.DEFAULT_MODE); + this.pushMode(Lexer.DEFAULT_MODE); this.paren_or_bracket_openedStack.push(0); break; case PythonLexer.LPAR: @@ -278,26 +266,26 @@ export default class PythonLexerBase extends Lexer { case PythonLexer.SINGLE_QUOTE_FSTRING_MODE: case PythonLexer.LONG_SINGLE_QUOTE_FSTRING_MODE: case PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE: - this.mode(PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + this.setMode(PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode break; case PythonLexer.DOUBLE_QUOTE_FSTRING_MODE: case PythonLexer.LONG_DOUBLE_QUOTE_FSTRING_MODE: case PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE: - this.mode(PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + this.setMode(PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode break; } } break; case PythonLexer.RBRACE: switch (this._mode) { - case PythonLexer.DEFAULT_MODE: + case Lexer.DEFAULT_MODE: case PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE: case PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE: this.popMode(); this.paren_or_bracket_openedStack.pop(); break; default: - this.reportLexerError("f-string: single '}' is not allowed"); + this.#reportLexerError("f-string: single '}' is not allowed"); break; } break; @@ -305,78 +293,68 @@ export default class PythonLexerBase extends Lexer { } } - handleFORMAT_SPECIFICATION_MODE() { + #handleFORMAT_SPECIFICATION_MODE() { if (this._modeStack.length > 0 && this.ffgToken.type === PythonLexer.RBRACE) { switch (this.curToken.type) { case PythonLexer.COLON: case PythonLexer.RBRACE: // insert an empty FSTRING_MIDDLE token instead of the missing format specification - this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken); + this.#createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", 
this.ffgToken); break; } } } - insertTrailingTokens() { + #insertTrailingTokens() { switch (this.lastPendingTokenTypeFromDefaultChannel) { case PythonLexer.NEWLINE: case PythonLexer.DEDENT: break; // no trailing NEWLINE token is needed default: // insert an extra trailing NEWLINE token that serves as the end of the last statement - this.createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken); // _ffgToken is EOF + this.#createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken); // _ffgToken is EOF } - this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed + this.#insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed } - handleEOFtoken() { + #handleEOFtoken() { if (this.lastPendingTokenTypeFromDefaultChannel > 0) { // there was a statement in the input (leading NEWLINE tokens are hidden) - this.insertTrailingTokens(); + this.#insertTrailingTokens(); } - this.addPendingToken(this.curToken); + this.#addPendingToken(this.curToken); } - hideAndAddPendingToken(cToken) { - cToken.channel = Token.HIDDEN_CHANNEL; - this.addPendingToken(cToken); + #hideAndAddPendingToken(ctkn) { + ctkn.channel = Token.HIDDEN_CHANNEL; + this.#addPendingToken(ctkn); } - createAndAddPendingToken(type, channel, text, baseToken) { - const cToken = this.getCommonTokenByToken(baseToken); - cToken.type = type; - cToken.channel = channel; - cToken.stop = baseToken.start - 1; - cToken.text = text == null ? + #createAndAddPendingToken(type, channel, text, sampleToken) { + const ctkn = sampleToken.clone(); + ctkn.type = type; + ctkn.channel = channel; + ctkn.stop = sampleToken.start - 1; + ctkn.text = text == null ? 
`<${this.getSymbolicNames()[type]}>` : text; - this.addPendingToken(cToken); + this.#addPendingToken(ctkn); } - addPendingToken(token) { + #addPendingToken(tkn) { // save the last pending token type because the _pendingTokens linked list can be empty by the nextToken() - this.previousPendingTokenType = token.type; - if (token.channel === Token.DEFAULT_CHANNEL) { + this.previousPendingTokenType = tkn.type; + if (tkn.channel === Token.DEFAULT_CHANNEL) { this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType; } - this.pendingTokens.push(token) /* .addLast(token) */; - } - - getCommonTokenByToken(oldToken) { - let commonToken = new CommonToken(oldToken.source, oldToken.type, oldToken.channel, oldToken.start, oldToken.stop); - commonToken.tokenIndex = oldToken.tokenIndex; - commonToken.line = oldToken.line; - commonToken.column = oldToken.column; - commonToken.text = oldToken.text; - return commonToken; + this.pendingTokens.push(tkn) /* .addLast(token) */; } - getIndentationLength(textWS) { // the textWS may contain spaces, tabs or form feeds + #getIndentationLength(indentText) { // the indentText may contain spaces, tabs or form feeds const TAB_LENGTH = 8; // the standard number of spaces to replace a tab to spaces let length = 0; - - for (let ch of textWS) { + for (let ch of indentText) { switch (ch) { case " ": this.wasSpaceIndentation = true; @@ -395,25 +373,20 @@ export default class PythonLexerBase extends Lexer { if (this.wasTabIndentation && this.wasSpaceIndentation) { if (!this.wasIndentationMixedWithSpacesAndTabs) { this.wasIndentationMixedWithSpacesAndTabs = true; - return this.INVALID_LENGTH; // only for the first inconsistent indent + length = this.#INVALID_LENGTH; // only for the first inconsistent indent } } return length; } - reportLexerError(errMsg) { - this.getErrorListenerDispatch().syntaxError(this, this.curToken, this.curToken.line, this.curToken.column, " LEXER" + this.ERR_TXT + errMsg, null); + #reportLexerError(errMsg) { + 
this.getErrorListener().syntaxError(this, this.curToken, this.curToken.line, this.curToken.column, " LEXER" + this.#ERR_TXT + errMsg, null); } - reportError(errMsg) { - this.reportLexerError(errMsg); - - // the ERROR_TOKEN will raise an error in the parser - this.createAndAddPendingToken(PythonLexer.ERROR_TOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken); - } + #reportError(errMsg) { + this.#reportLexerError(errMsg); - reset() { - this.init(); - super.reset(); + // the ERRORTOKEN will raise an error in the parser + this.#createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.#ERR_TXT + errMsg, this.ffgToken); } } diff --git a/python/python3_12_1/Python3/PythonLexerBase.py b/python/python3_12_1/Python3/PythonLexerBase.py index 1926ab1ffc..d9a95ea764 100644 --- a/python/python3_12_1/Python3/PythonLexerBase.py +++ b/python/python3_12_1/Python3/PythonLexerBase.py @@ -21,7 +21,6 @@ # # Developed by : Robert Einhorn -from collections import deque from typing import TextIO from antlr4 import InputStream, Lexer, Token from antlr4.Token import CommonToken @@ -33,293 +32,278 @@ def __init__(self, input: InputStream, output: TextIO = sys.stdout): super().__init__(input, output) # A stack that keeps track of the indentation lengths - self.indent_length_stack: Deque[int] + self.__indent_length_stack: list[int] # A list where tokens are waiting to be loaded into the token stream - self.pending_tokens: list[CommonToken] + self.__pending_tokens: list[CommonToken] # last pending token types - self.previous_pending_token_type: int - self.last_pending_token_type_from_default_channel: int + self.__previous_pending_token_type: int + self.__last_pending_token_type_from_default_channel: int # The amount of opened parentheses, square brackets or curly braces - self.opened: int + self.__opened: int # The amount of opened parentheses and square brackets in the current lexer mode - self.paren_or_bracket_opened_stack: Deque[int] - - 
self.was_space_indentation: bool - self.was_tab_indentation: bool - self.was_indentation_mixed_with_spaces_and_tabs: bool - self.INVALID_LENGTH: int - - self.cur_token: CommonToken # current (under processing) token - self.ffg_token: CommonToken # following (look ahead) token - - self.ERR_TXT: str - - self.init() - - def init(self): - self.indent_length_stack = deque() - self.pending_tokens = [] - self.previous_pending_token_type = 0 - self.last_pending_token_type_from_default_channel = 0 - self.opened = 0 - self.paren_or_bracket_opened_stack = deque() - self.was_space_indentation = False - self.was_tab_indentation = False - self.was_indentation_mixed_with_spaces_and_tabs = False - self.INVALID_LENGTH = -1 - self.cur_token = None - self.ffg_token = None - self.ERR_TXT = " ERROR: " + self.__paren_or_bracket_opened_stack: list[int] + + self.__was_space_indentation: bool + self.__was_tab_indentation: bool + self.__was_indentation_mixed_with_spaces_and_tabs: bool + + self.__cur_token: CommonToken # current (under processing) token + self.__ffg_token: CommonToken # following (look ahead) token + + self.__INVALID_LENGTH: int = -1 + self.__ERR_TXT: str = " ERROR: " + + self.__init() def nextToken(self) -> CommonToken: # reading the input stream until a return EOF - self.check_next_token() - return self.pending_tokens.pop(0) # add the queued token to the token stream - - def check_next_token(self): - if self.previous_pending_token_type != Token.EOF: - self.set_current_and_following_tokens() - if len(self.indent_length_stack) == 0: # We're at the first token - self.handle_start_of_input() - match self.cur_token.type: + self.__check_next_token() + return self.__pending_tokens.pop(0) # add the queued token to the token stream + + def reset(self) -> None: + self.__init() + super().reset() + + def __init(self) -> None: + self.__indent_length_stack = [] + self.__pending_tokens = [] + self.__previous_pending_token_type = 0 + self.__last_pending_token_type_from_default_channel = 0 
+ self.__opened = 0 + self.__paren_or_bracket_opened_stack = [] + self.__was_space_indentation = False + self.__was_tab_indentation = False + self.__was_indentation_mixed_with_spaces_and_tabs = False + self.__cur_token = None + self.__ffg_token = None + + def __check_next_token(self) -> None: + if self.__previous_pending_token_type != Token.EOF: + self.__set_current_and_following_tokens() + if len(self.__indent_length_stack) == 0: # We're at the first token + self.__handle_start_of_input() + + match self.__cur_token.type: case self.LPAR | self.LSQB | self.LBRACE: - self.opened += 1 - self.add_pending_token(self.cur_token) + self.__opened += 1 + self.__add_pending_token(self.__cur_token) case self.RPAR | self.RSQB | self.RBRACE: - self.opened -= 1 - self.add_pending_token(self.cur_token) + self.__opened -= 1 + self.__add_pending_token(self.__cur_token) case self.NEWLINE: - self.handle_NEWLINE_token() - case self.STRING: - self.handle_STRING_token() + self.__handle_NEWLINE_token() case self.FSTRING_MIDDLE: - self.handle_FSTRING_MIDDLE_token() - case self.ERROR_TOKEN: - self.report_lexer_error("token recognition error at: '" + self.cur_token.text + "'") - self.add_pending_token(self.cur_token) + self.__handle_FSTRING_MIDDLE_token() + case self.ERRORTOKEN: + self.__report_lexer_error("token recognition error at: '" + self.__cur_token.text + "'") + self.__add_pending_token(self.__cur_token) case Token.EOF: - self.handle_EOF_token() + self.__handle_EOF_token() case other: - self.add_pending_token(self.cur_token) - self.handle_FORMAT_SPECIFICATION_MODE() + self.__add_pending_token(self.__cur_token) + self.__handle_FORMAT_SPECIFICATION_MODE() - def set_current_and_following_tokens(self): - self.cur_token = super().nextToken() if self.ffg_token is None else \ - self.ffg_token + def __set_current_and_following_tokens(self) -> None: + self.__cur_token = super().nextToken() if self.__ffg_token is None else \ + self.__ffg_token - self.handle_fstring_lexer_modes() + 
self.__handle_fstring_lexer_modes() - self.ffg_token = self.cur_token if self.cur_token.type == Token.EOF else \ - super().nextToken() + self.__ffg_token = self.__cur_token if self.__cur_token.type == Token.EOF else \ + super().nextToken() # initialize the _indent_length_stack # hide the leading NEWLINE token(s) # if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel # insert a leading INDENT token if necessary - def handle_start_of_input(self): + def __handle_start_of_input(self) -> None: # initialize the stack with a default 0 indentation length - self.indent_length_stack.append(0) # this will never be popped off - while self.cur_token.type != Token.EOF: - if self.cur_token.channel == Token.DEFAULT_CHANNEL: - if self.cur_token.type == self.NEWLINE: + self.__indent_length_stack.append(0) # this will never be popped off + while self.__cur_token.type != Token.EOF: + if self.__cur_token.channel == Token.DEFAULT_CHANNEL: + if self.__cur_token.type == self.NEWLINE: # all the NEWLINE tokens must be ignored before the first statement - self.hide_and_add_pending_token(self.cur_token) + self.__hide_and_add_pending_token(self.__cur_token) else: # We're at the first statement - self.insert_leading_indent_token() - return # continue the processing of the current token with check_next_token() + self.__insert_leading_indent_token() + return # continue the processing of the current token with __check_next_token() else: - self.add_pending_token(self.cur_token) # it can be WS, EXPLICIT_LINE_JOINING or COMMENT token - self.set_current_and_following_tokens() - # continue the processing of the EOF token with check_next_token() - - def insert_leading_indent_token(self): - if self.previous_pending_token_type == self.WS: - prev_token: CommonToken = self.pending_tokens[-1] # WS token - if self.get_indentation_length(prev_token.text) != 0: # there is an "indentation" before the first statement + self.__add_pending_token(self.__cur_token) # it 
can be WS, EXPLICIT_LINE_JOINING or COMMENT token + self.__set_current_and_following_tokens() + # continue the processing of the EOF token with __check_next_token() + + def __insert_leading_indent_token(self) -> None: + if self.__previous_pending_token_type == self.WS: + prev_token: CommonToken = self.__pending_tokens[-1] # WS token + if self.__get_indentation_length(prev_token.text) != 0: # there is an "indentation" before the first statement err_msg: str = "first statement indented" - self.report_lexer_error(err_msg) + self.__report_lexer_error(err_msg) # insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser - self.create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, self.ERR_TXT + err_msg, self.cur_token) + self.__create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, self.__ERR_TXT + err_msg, self.__cur_token) - def handle_NEWLINE_token(self): - if self.opened > 0: # We're in an implicit line joining, ignore the current NEWLINE token - self.hide_and_add_pending_token(self.cur_token) + def __handle_NEWLINE_token(self) -> None: + if self.__opened > 0: # We're in an implicit line joining, ignore the current NEWLINE token + self.__hide_and_add_pending_token(self.__cur_token) else: - nl_token: CommonToken = self.cur_token # save the current NEWLINE token - is_looking_ahead: bool = self.ffg_token.type == self.WS + nl_token: CommonToken = self.__cur_token.clone() # save the current NEWLINE token + is_looking_ahead: bool = self.__ffg_token.type == self.WS if is_looking_ahead: - self.set_current_and_following_tokens() # set the next two tokens + self.__set_current_and_following_tokens() # set the next two tokens - match self.ffg_token.type: - case self.NEWLINE | self.COMMENT | self.TYPE_COMMENT: - # We're before a blank line or a comment or a type comment - self.hide_and_add_pending_token(nl_token) # ignore the NEWLINE token + match self.__ffg_token.type: + case self.NEWLINE | self.COMMENT: + # 
We're before a blank line or a comment or type comment or a type ignore comment + self.__hide_and_add_pending_token(nl_token) # ignore the NEWLINE token if is_looking_ahead: - self.add_pending_token(self.cur_token) # WS token + self.__add_pending_token(self.__cur_token) # WS token case other: - self.add_pending_token(nl_token) + self.__add_pending_token(nl_token) if is_looking_ahead: # We're on a whitespace(s) followed by a statement - indentation_length: int = 0 if self.ffg_token.type == Token.EOF else \ - self.get_indentation_length(self.cur_token.text) + indentation_length: int = 0 if self.__ffg_token.type == Token.EOF else \ + self.__get_indentation_length(self.__cur_token.text) - if indentation_length != self.INVALID_LENGTH: - self.add_pending_token(self.cur_token) # WS token - self.insert_indent_or_dedent_token(indentation_length) # may insert INDENT token or DEDENT token(s) + if indentation_length != self.__INVALID_LENGTH: + self.__add_pending_token(self.__cur_token) # WS token + self.__insert_indent_or_dedent_token(indentation_length) # may insert INDENT token or DEDENT token(s) else: - self.report_error("inconsistent use of tabs and spaces in indentation") + self.__report_error("inconsistent use of tabs and spaces in indentation") else: # We're at a newline followed by a statement (there is no whitespace before the statement) - self.insert_indent_or_dedent_token(0) # may insert DEDENT token(s) + self.__insert_indent_or_dedent_token(0) # may insert DEDENT token(s) - def insert_indent_or_dedent_token(self, indent_length: int): - prev_indent_length: int = self.indent_length_stack[-1] # peek() + def __insert_indent_or_dedent_token(self, indent_length: int) -> None: + prev_indent_length: int = self.__indent_length_stack[-1] # peek() if indent_length > prev_indent_length: - self.create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, None, self.ffg_token) - self.indent_length_stack.append(indent_length) + self.__create_and_add_pending_token(self.INDENT, 
Token.DEFAULT_CHANNEL, None, self.__ffg_token) + self.__indent_length_stack.append(indent_length) else: while indent_length < prev_indent_length: # more than 1 DEDENT token may be inserted to the token stream - self.indent_length_stack.pop() - prev_indent_length = self.indent_length_stack[-1] # peek() + self.__indent_length_stack.pop() + prev_indent_length = self.__indent_length_stack[-1] # peek() if indent_length <= prev_indent_length: - self.create_and_add_pending_token(self.DEDENT, Token.DEFAULT_CHANNEL, None, self.ffg_token) + self.__create_and_add_pending_token(self.DEDENT, Token.DEFAULT_CHANNEL, None, self.__ffg_token) else: - self.report_error("inconsistent dedent") + self.__report_error("inconsistent dedent") - def handle_STRING_token(self): # remove the \ escape sequences from the string literal - # https://docs.python.org/3.11/reference/lexical_analysis.html#string-and-bytes-literals - line_joinFreeStringLiteral: str = re.sub(r"\\\r?\n", "", self.cur_token.text) - if len(self.cur_token.text) == len(line_joinFreeStringLiteral): - self.add_pending_token(self.cur_token) - else: - originalSTRINGtoken: CommonToken = self.cur_token.clone() # backup the original token - self.cur_token.text = line_joinFreeStringLiteral - self.add_pending_token(self.cur_token) # add the modified token with inline string literal - self.hide_and_add_pending_token(originalSTRINGtoken) # add the original token to the hidden channel - # this inserted hidden token allows to restore the original string literal with the \ escape sequences - - def handle_FSTRING_MIDDLE_token(self): # replace the double braces '{{' or '}}' to single braces and hide the second braces - fs_mid: str = self.cur_token.text + def __handle_FSTRING_MIDDLE_token(self) -> None: # replace the double braces '{{' or '}}' to single braces and hide the second braces + fs_mid: str = self.__cur_token.text fs_mid = fs_mid.replace("{{", "{_").replace("}}", "}_") # replace: {{ --> {_ and }} --> }_ - arrOfStr: list[str] = 
re.split(r"(?<=[{}])_", fs_mid) # split by {_ or }_ + arr_of_str: list[str] = re.split(r"(?<=[{}])_", fs_mid) # split by {_ or }_ s: str - for s in arrOfStr: + for s in arr_of_str: if s: - self.create_and_add_pending_token(self.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, s, self.ffg_token) - lastCharacter: str = s[-1:] - if lastCharacter in "{}": - self.create_and_add_pending_token(self.FSTRING_MIDDLE, Token.HIDDEN_CHANNEL, lastCharacter, self.ffg_token) + self.__create_and_add_pending_token(self.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, s, self.__ffg_token) + last_character: str = s[-1:] + if last_character in "{}": + self.__create_and_add_pending_token(self.FSTRING_MIDDLE, Token.HIDDEN_CHANNEL, last_character, self.__ffg_token) - def handle_fstring_lexer_modes(self): + def __handle_fstring_lexer_modes(self) -> None: if self._modeStack: - match self.cur_token.type: + match self.__cur_token.type: case self.LBRACE: self.pushMode(Lexer.DEFAULT_MODE) - self.paren_or_bracket_opened_stack.append(0) + self.__paren_or_bracket_opened_stack.append(0) case self.LPAR | self.LSQB: # https://peps.python.org/pep-0498/#lambdas-inside-expressions - self.paren_or_bracket_opened_stack[-1] += 1 # increment the last element (peek() + 1) + self.__paren_or_bracket_opened_stack[-1] += 1 # increment the last element (peek() + 1) case self.RPAR | self.RSQB: - self.paren_or_bracket_opened_stack[-1] -= 1 # decrement the last element (peek() - 1) + self.__paren_or_bracket_opened_stack[-1] -= 1 # decrement the last element (peek() - 1) case self.COLON: - if self.paren_or_bracket_opened_stack[-1] == 0: + if self.__paren_or_bracket_opened_stack[-1] == 0: match self._modeStack[-1]: # check the previous lexer mode (the current is DEFAULT_MODE) case self.SINGLE_QUOTE_FSTRING_MODE \ - | self.LONG_SINGLE_QUOTE_FSTRING_MODE \ - | self.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE: + | self.LONG_SINGLE_QUOTE_FSTRING_MODE \ + | self.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE: 
self.mode(self.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE) # continue in format spec. mode case self.DOUBLE_QUOTE_FSTRING_MODE \ - | self.LONG_DOUBLE_QUOTE_FSTRING_MODE \ - | self.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE: + | self.LONG_DOUBLE_QUOTE_FSTRING_MODE \ + | self.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE: self.mode(self.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE) # continue in format spec. mode case self.RBRACE: match self._mode: case Lexer.DEFAULT_MODE \ - | self.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE \ - | self.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE: + | self.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE \ + | self.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE: self.popMode() - self.paren_or_bracket_opened_stack.pop() + self.__paren_or_bracket_opened_stack.pop() case other: - self.report_lexer_error("f-string: single '}' is not allowed") + self.__report_lexer_error("f-string: single '}' is not allowed") - def handle_FORMAT_SPECIFICATION_MODE(self): + def __handle_FORMAT_SPECIFICATION_MODE(self) -> None: if len(self._modeStack) != 0 \ - and self.ffg_token.type == self.RBRACE: + and self.__ffg_token.type == self.RBRACE: - match self.cur_token.type: + match self.__cur_token.type: case self.COLON | self.RBRACE: # insert an empty FSTRING_MIDDLE token instead of the missing format specification - self.create_and_add_pending_token(self.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", self.ffg_token) + self.__create_and_add_pending_token(self.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", self.__ffg_token) - def insert_trailing_tokens(self): - match self.last_pending_token_type_from_default_channel: + def __insert_trailing_tokens(self) -> None: + match self.__last_pending_token_type_from_default_channel: case self.NEWLINE | self.DEDENT: pass # no trailing NEWLINE token is needed case other: # insert an extra trailing NEWLINE token that serves as the end of the last statement - self.create_and_add_pending_token(self.NEWLINE, Token.DEFAULT_CHANNEL, None, self.ffg_token) # _ffg_token is EOF - 
self.insert_indent_or_dedent_token(0) # Now insert as much trailing DEDENT tokens as needed + self.__create_and_add_pending_token(self.NEWLINE, Token.DEFAULT_CHANNEL, None, self.__ffg_token) # _ffg_token is EOF + self.__insert_indent_or_dedent_token(0) # Now insert as much trailing DEDENT tokens as needed - def handle_EOF_token(self): - if self.last_pending_token_type_from_default_channel > 0: + def __handle_EOF_token(self) -> None: + if self.__last_pending_token_type_from_default_channel > 0: # there was statement in the input (leading NEWLINE tokens are hidden) - self.insert_trailing_tokens() - self.add_pending_token(self.cur_token) + self.__insert_trailing_tokens() + self.__add_pending_token(self.__cur_token) - def hide_and_add_pending_token(self, cToken: CommonToken): - cToken.channel = Token.HIDDEN_CHANNEL - self.add_pending_token(cToken) + def __hide_and_add_pending_token(self, ctkn: CommonToken) -> None: + ctkn.channel = Token.HIDDEN_CHANNEL + self.__add_pending_token(ctkn) - def create_and_add_pending_token(self, type: int, channel: int, text: str, base_token: CommonToken): - cToken: CommonToken = base_token.clone() - cToken.type = type - cToken.channel = channel - cToken.stop = base_token.start - 1 - cToken.text = "<" + self.symbolicNames[type] + ">" if text is None else \ - text + def __create_and_add_pending_token(self, ttype: int, channel: int, text: str, sample_token: CommonToken) -> None: + ctkn: CommonToken = sample_token.clone() + ctkn.type = ttype + ctkn.channel = channel + ctkn.stop = sample_token.start - 1 + ctkn.text = "<" + self.symbolicNames[ttype] + ">" if text is None else \ + text - self.add_pending_token(cToken) + self.__add_pending_token(ctkn) - def add_pending_token(self, token: CommonToken): + def __add_pending_token(self, ctkn: CommonToken) -> None: # save the last pending token type because the _pending_tokens list can be empty by the nextToken() - self.previous_pending_token_type = token.type - if token.channel == 
Token.DEFAULT_CHANNEL: - self.last_pending_token_type_from_default_channel = self.previous_pending_token_type - self.pending_tokens.append(token) + self.__previous_pending_token_type = ctkn.type + if ctkn.channel == Token.DEFAULT_CHANNEL: + self.__last_pending_token_type_from_default_channel = self.__previous_pending_token_type + self.__pending_tokens.append(ctkn) - def get_indentation_length(self, textWS: str) -> int: # the textWS may contain spaces, tabs or form feeds + def __get_indentation_length(self, indentText: str) -> int: # the indentText may contain spaces, tabs or form feeds TAB_LENGTH: int = 8 # the standard number of spaces to replace a tab to spaces length: int = 0 ch: str - for ch in textWS: + for ch in indentText: match ch: case ' ': - self.was_space_indentation = True + self.__was_space_indentation = True length += 1 case '\t': - self.was_tab_indentation = True + self.__was_tab_indentation = True length += TAB_LENGTH - (length % TAB_LENGTH) case '\f': # form feed length = 0 - if self.was_tab_indentation and self.was_space_indentation: - if not self.was_indentation_mixed_with_spaces_and_tabs: - self.was_indentation_mixed_with_spaces_and_tabs = True - return self.INVALID_LENGTH # only for the first inconsistent indent + if self.__was_tab_indentation and self.__was_space_indentation: + if not self.__was_indentation_mixed_with_spaces_and_tabs: + self.__was_indentation_mixed_with_spaces_and_tabs = True + length = self.__INVALID_LENGTH # only for the first inconsistent indent return length - def report_lexer_error(self, err_msg): - self.getErrorListenerDispatch().syntaxError(self, self.cur_token, self.cur_token.line, self.cur_token.column, " LEXER" + self.ERR_TXT + err_msg, None) + def __report_lexer_error(self, err_msg: str) -> None: + self.getErrorListenerDispatch().syntaxError(self, self.__cur_token, self.__cur_token.line, self.__cur_token.column, " LEXER" + self.__ERR_TXT + err_msg, None) - def report_error(self, err_msg): - 
self.report_lexer_error(err_msg) + def __report_error(self, err_msg: str) -> None: + self.__report_lexer_error(err_msg) - # the ERROR_TOKEN will raise an error in the parser - self.create_and_add_pending_token(self.ERROR_TOKEN, Token.DEFAULT_CHANNEL, self.ERR_TXT + err_msg, self.ffg_token) - - def reset(self): - self.init() - super().reset() + # the ERRORTOKEN will raise an error in the parser + self.__create_and_add_pending_token(self.ERRORTOKEN, Token.DEFAULT_CHANNEL, self.__ERR_TXT + err_msg, self.__ffg_token) diff --git a/python/python3_12_1/Python3_12_1_official_grammar.peg b/python/python3_12_1/Python3_12_6_official_grammar.peg similarity index 99% rename from python/python3_12_1/Python3_12_1_official_grammar.peg rename to python/python3_12_1/Python3_12_6_official_grammar.peg index 49d4a9ad34..8714b514d1 100644 --- a/python/python3_12_1/Python3_12_1_official_grammar.peg +++ b/python/python3_12_1/Python3_12_6_official_grammar.peg @@ -1,3 +1,7 @@ +# PEG grammar for Python + + + # ========================= START OF THE GRAMMAR ========================= # General grammatical elements and rules: @@ -494,9 +498,7 @@ type_param_seq: ','.type_param+ [','] type_param: | NAME [type_param_bound] - | '*' NAME ':' expression | '*' NAME - | '**' NAME ':' expression | '**' NAME type_param_bound: ':' expression diff --git a/python/python3_12_1/PythonLexer.g4 b/python/python3_12_1/PythonLexer.g4 index e3a5ed3ecb..ead8b3c89e 100644 --- a/python/python3_12_1/PythonLexer.g4 +++ b/python/python3_12_1/PythonLexer.g4 @@ -32,6 +32,7 @@ options { superClass=PythonLexerBase; } tokens { INDENT, DEDENT // https://docs.python.org/3.12/reference/lexical_analysis.html#indentation + , TYPE_COMMENT // not supported, only for compatibility with the PythonParser.g4 grammar , FSTRING_START, FSTRING_MIDDLE, FSTRING_END // https://peps.python.org/pep-0701/#specification } @@ -147,15 +148,8 @@ STRING | BYTES_LITERAL ; -// https://peps.python.org/pep-0484/#type-comments -TYPE_COMMENT - : '#' WS? 
'type:' ~[\r\n]* - ; - // https://docs.python.org/3.12/reference/lexical_analysis.html#physical-lines -NEWLINE - : OS_INDEPENDENT_NL - ; +NEWLINE : '\r'? '\n'; // Unix, Windows // https://docs.python.org/3.12/reference/lexical_analysis.html#comments COMMENT : '#' ~[\r\n]* -> channel(HIDDEN); @@ -172,7 +166,7 @@ DOUBLE_QUOTE_FSTRING_START : F_STRING_PREFIX ["] -> type(FSTRING_STAR LONG_SINGLE_QUOTE_FSTRING_START : F_STRING_PREFIX ['][']['] -> type(FSTRING_START), pushMode(LONG_SINGLE_QUOTE_FSTRING_MODE); LONG_DOUBLE_QUOTE_FSTRING_START : F_STRING_PREFIX ["]["]["] -> type(FSTRING_START), pushMode(LONG_DOUBLE_QUOTE_FSTRING_MODE); -ERROR_TOKEN : . ; // catch the unrecognized characters and redirect these errors to the parser +ERRORTOKEN : . ; // catch the unrecognized characters and redirect these errors to the parser /* @@ -214,6 +208,8 @@ mode DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE; // only used after a format specifi * fragments */ +fragment IGNORE: 'ignore'; + // https://docs.python.org/3.12/reference/lexical_analysis.html#literals // https://docs.python.org/3.12/reference/lexical_analysis.html#string-and-bytes-literals @@ -240,10 +236,10 @@ fragment SHORT_STRING_CHAR_NO_DOUBLE_QUOTE : ~[\\\r\n"]; // -fragment STRING_ESCAPE_SEQ - : '\\' OS_INDEPENDENT_NL // \ escape sequence - | '\\' . // "\" - ; // the \ (not \n) escape sequences will be removed from the string literals by the PythonLexerBase class +fragment STRING_ESCAPE_SEQ // https://docs.python.org/3/reference/lexical_analysis.html#escape-sequences + : '\\' '\r' '\n' // for the two-character Windows line break: \ escape sequence (string literal line continuation) + | '\\' . // "\" + ; fragment BYTES_LITERAL : BYTES_PREFIX (SHORT_BYTES | LONG_BYTES); fragment BYTES_PREFIX : 'b' | 'B' | 'br' | 'Br' | 'bR' | 'BR' | 'rb' | 'rB' | 'Rb' | 'RB'; @@ -315,9 +311,6 @@ fragment EXPONENT : ('e' | 'E') ('+' | '-')? 
DIGIT_PART; // https://docs.python.org/3.12/reference/lexical_analysis.html#imaginary-literals fragment IMAG_NUMBER : (FLOAT_NUMBER | DIGIT_PART) ('j' | 'J'); -// https://docs.python.org/3.12/reference/lexical_analysis.html#physical-lines -fragment OS_INDEPENDENT_NL : '\r'? '\n'; // Unix, Windows - // https://github.com/RobEin/ANTLR4-parser-for-Python-3.12/tree/main/valid_chars_in_py_identifiers fragment ID_CONTINUE: ID_START diff --git a/python/python3_12_1/PythonParser.g4 b/python/python3_12_1/PythonParser.g4 index c245eb5711..fc86feac7c 100644 --- a/python/python3_12_1/PythonParser.g4 +++ b/python/python3_12_1/PythonParser.g4 @@ -27,7 +27,7 @@ THE SOFTWARE. * */ -parser grammar PythonParser; // Python 3.12.1 https://docs.python.org/3.12/reference/grammar.html#full-grammar-specification +parser grammar PythonParser; // Python 3.12.6 https://docs.python.org/3.12/reference/grammar.html#full-grammar-specification options { tokenVocab=PythonLexer; superClass=PythonParserBase; @@ -475,8 +475,8 @@ type_param_seq: type_param (',' type_param)* ','?; type_param : NAME type_param_bound? - | '*' NAME (':' expression)? - | '**' NAME (':' expression)? 
+ | '*' NAME + | '**' NAME ; diff --git a/python/python3_12_1/README.md b/python/python3_12_1/README.md index ca6cb76b5e..ffe8f60c50 100644 --- a/python/python3_12_1/README.md +++ b/python/python3_12_1/README.md @@ -1,4 +1,4 @@ -# Python 3.12.1 parser +# Python 3.12.6 parser ### About files: - PythonParser.g4 diff --git a/python/python3_12_1/TypeScript/PythonLexerBase.ts b/python/python3_12_1/TypeScript/PythonLexerBase.ts new file mode 100644 index 0000000000..ce72f1782d --- /dev/null +++ b/python/python3_12_1/TypeScript/PythonLexerBase.ts @@ -0,0 +1,392 @@ +/* +The MIT License (MIT) +Copyright (c) 2021 Robert Einhorn + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+ */ + +/* + * + * Project : Python Indent/Dedent handler for ANTLR4 grammars + * + * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com + * + */ + +import { CharStream, Token, Lexer } from "antlr4"; +import PythonLexer from "./PythonLexer"; +import * as Collections from "typescript-collections"; + +export default abstract class PythonLexerBase extends Lexer { + // A stack that keeps track of the indentation lengths + private indentLengthStack!: Collections.Stack; + // A list where tokens are waiting to be loaded into the token stream + private pendingTokens!: Array; + + // last pending token types + private previousPendingTokenType!: number; + private lastPendingTokenTypeFromDefaultChannel!: number; + + // The amount of opened parentheses, square brackets or curly braces + private opened!: number; + // The amount of opened parentheses and square brackets in the current lexer mode + private paren_or_bracket_openedStack!: Collections.Stack; + + private wasSpaceIndentation!: boolean; + private wasTabIndentation!: boolean; + private wasIndentationMixedWithSpacesAndTabs!: boolean; + + private curToken: Token | undefined; // current (under processing) token + private ffgToken: Token | undefined; // following (look ahead) token + + private readonly INVALID_LENGTH: number = -1; + private readonly ERR_TXT: string = " ERROR: "; + + protected constructor(input: CharStream) { + super(input); + this.init(); + } + + public nextToken(): Token { // reading the input stream until a return EOF + this.checkNextToken(); + return this.pendingTokens.shift()! 
/* .pollFirst() */; // add the queued token to the token stream + } + + public reset(): void { + this.init(); + super.reset(); + } + + private init(): void { + this.indentLengthStack = new Collections.Stack(); + this.pendingTokens = []; + this.previousPendingTokenType = 0; + this.lastPendingTokenTypeFromDefaultChannel = 0; + this.opened = 0; + this.paren_or_bracket_openedStack = new Collections.Stack(); + this.wasSpaceIndentation = false; + this.wasTabIndentation = false; + this.wasIndentationMixedWithSpacesAndTabs = false; + this.curToken = undefined; + this.ffgToken = undefined; + } + + private checkNextToken(): void { + if (this.previousPendingTokenType !== PythonLexer.EOF) { + this.setCurrentAndFollowingTokens(); + if (this.indentLengthStack.isEmpty()) { // We're at the first token + this.handleStartOfInput(); + } + + switch (this.curToken!.type) { + case PythonLexer.LPAR: + case PythonLexer.LSQB: + case PythonLexer.LBRACE: + this.opened++; + this.addPendingToken(this.curToken!); + break; + case PythonLexer.RPAR: + case PythonLexer.RSQB: + case PythonLexer.RBRACE: + this.opened--; + this.addPendingToken(this.curToken!); + break; + case PythonLexer.NEWLINE: + this.handleNEWLINEtoken(); + break; + case PythonLexer.FSTRING_MIDDLE: + this.handleFSTRING_MIDDLE_token(); + break; + case PythonLexer.ERRORTOKEN: + this.reportLexerError(`token recognition error at: '${this.curToken!.text}'`); + this.addPendingToken(this.curToken!); + break; + case PythonLexer.EOF: + this.handleEOFtoken(); + break; + default: + this.addPendingToken(this.curToken!); + } + this.handleFORMAT_SPECIFICATION_MODE(); + } + } + + private setCurrentAndFollowingTokens(): void { + this.curToken = this.ffgToken == undefined + ? super.nextToken() + : this.ffgToken; + + this.handleFStringLexerModes(); + + this.ffgToken = this.curToken.type === PythonLexer.EOF + ? 
this.curToken + : super.nextToken(); + } + + // initialize the indentLengthStack + // hide the leading NEWLINE token(s) + // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel + // insert a leading INDENT token if necessary + private handleStartOfInput(): void { + // initialize the stack with a default 0 indentation length + this.indentLengthStack.push(0); // this will never be popped off + while (this.curToken!.type !== PythonLexer.EOF) { + if (this.curToken!.channel === Token.DEFAULT_CHANNEL) { + if (this.curToken!.type === PythonLexer.NEWLINE) { + // all the NEWLINE tokens must be ignored before the first statement + this.hideAndAddPendingToken(this.curToken!); + } else { // We're at the first statement + this.insertLeadingIndentToken(); + return; // continue the processing of the current token with checkNextToken() + } + } else { + this.addPendingToken(this.curToken!); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token + } + this.setCurrentAndFollowingTokens(); + } // continue the processing of the EOF token with checkNextToken() + } + + private insertLeadingIndentToken(): void { + if (this.previousPendingTokenType === PythonLexer.WS) { + const prevToken: Token = this.pendingTokens[this.pendingTokens.length - 1] /* .peekLast() */; // WS token + if (this.getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement + const errMsg: string = "first statement indented"; + this.reportLexerError(errMsg); + // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser + this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.curToken!); + } + } + } + + private handleNEWLINEtoken(): void { + if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token + this.hideAndAddPendingToken(this.curToken!); + } else { + const nlToken: Token = 
this.curToken?.clone()!; // save the current NEWLINE token + const isLookingAhead: boolean = this.ffgToken!.type === PythonLexer.WS; + if (isLookingAhead) { + this.setCurrentAndFollowingTokens(); // set the next two tokens + } + + switch (this.ffgToken!.type) { + case PythonLexer.NEWLINE: // We're before a blank line + case PythonLexer.COMMENT: // We're before a comment + this.hideAndAddPendingToken(nlToken); + if (isLookingAhead) { + this.addPendingToken(this.curToken!); // WS token + } + break; + default: + this.addPendingToken(nlToken); + if (isLookingAhead) { // We're on whitespace(s) followed by a statement + const indentationLength: number = this.ffgToken!.type === PythonLexer.EOF ? + 0 : + this.getIndentationLength(this.curToken!.text); + + if (indentationLength !== this.INVALID_LENGTH) { + this.addPendingToken(this.curToken!); // WS token + this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) + } else { + this.reportError("inconsistent use of tabs and spaces in indentation"); + } + } else { // We're at a newline followed by a statement (there is no whitespace before the statement) + this.insertIndentOrDedentToken(0); // may insert DEDENT token(s) + } + } + } + } + + private insertIndentOrDedentToken(indentLength: number): void { + let prevIndentLength: number = this.indentLengthStack.peek()!; + if (indentLength > prevIndentLength) { + this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken!); + this.indentLengthStack.push(indentLength); + } else { + while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream + this.indentLengthStack.pop(); + prevIndentLength = this.indentLengthStack.peek()!; + if (indentLength <= prevIndentLength) { + this.createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken!); + } else { + this.reportError("inconsistent dedent"); + } + } + } + } + + private 
handleFSTRING_MIDDLE_token(): void { // replace the double braces '{{' or '}}' to single braces and hide the second braces + let fsMid: string = this.curToken!.text; + fsMid = fsMid.replace(/\{\{/g, "{_").replace(/\}\}/g, "}_"); // replace: {{ --> {_ and }} --> }_ + const arrOfStr: string[] = fsMid.split(/(?<=[{}])_/); // split by {_ or }_ + for (let s of arrOfStr) { + if (s) { + this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, s, this.ffgToken!); + const lastCharacter: string = s.charAt(s.length - 1); + if ("{}".includes(lastCharacter)) { + this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.HIDDEN_CHANNEL, lastCharacter, this.ffgToken!); + // this inserted hidden token allows to restore the original f-string literal with the double braces + } + } + } + } + + private handleFStringLexerModes(): void { // https://peps.python.org/pep-0498/#specification + if (this.getModeStack().length > 0) { + switch (this.curToken!.type) { + case PythonLexer.LBRACE: + this.pushMode(Lexer.DEFAULT_MODE); + this.paren_or_bracket_openedStack.push(0); + break; + case PythonLexer.LPAR: + case PythonLexer.LSQB: + // https://peps.python.org/pep-0498/#lambdas-inside-expressions + this.paren_or_bracket_openedStack.push(this.paren_or_bracket_openedStack.pop()! + 1); // increment the last element + break; + case PythonLexer.RPAR: + case PythonLexer.RSQB: + this.paren_or_bracket_openedStack.push(this.paren_or_bracket_openedStack.pop()! 
- 1); // decrement the last element + break; + case PythonLexer.COLON: // colon can only come from DEFAULT_MODE + if (this.paren_or_bracket_openedStack.peek() == 0) { + switch (this.getModeStack().at(-1) /* peek() */) { // check the previous lexer mode (the current is DEFAULT_MODE) + case PythonLexer.SINGLE_QUOTE_FSTRING_MODE: + case PythonLexer.LONG_SINGLE_QUOTE_FSTRING_MODE: + case PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE: + this.setMode(PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer.DOUBLE_QUOTE_FSTRING_MODE: + case PythonLexer.LONG_DOUBLE_QUOTE_FSTRING_MODE: + case PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE: + this.setMode(PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + } + } + break; + case PythonLexer.RBRACE: + switch (this.getMode()) { + case Lexer.DEFAULT_MODE: + case PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE: + this.popMode(); + this.paren_or_bracket_openedStack.pop(); + break; + default: + this.reportLexerError("f-string: single '}' is not allowed"); + break; + } + break; + } + } + } + + private handleFORMAT_SPECIFICATION_MODE(): void { + if (this.getModeStack().length > 0 && this.ffgToken!.type === PythonLexer.RBRACE) { + switch (this.curToken!.type) { + case PythonLexer.COLON: + case PythonLexer.RBRACE: + // insert an empty FSTRING_MIDDLE token instead of the missing format specification + this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken!); + break; + } + } + } + + private insertTrailingTokens(): void { + switch (this.lastPendingTokenTypeFromDefaultChannel) { + case PythonLexer.NEWLINE: + case PythonLexer.DEDENT: + break; // no trailing NEWLINE token is needed + default: + // insert an extra trailing NEWLINE token that serves as the end of the last statement + 
this.createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken!); // ffgToken is EOF + } + this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed + } + + private handleEOFtoken(): void { + if (this.lastPendingTokenTypeFromDefaultChannel > 0) { + // there was a statement in the input (leading NEWLINE tokens are hidden) + this.insertTrailingTokens(); + } + this.addPendingToken(this.curToken!); + } + + private hideAndAddPendingToken(tkn: Token): void { + tkn.channel = Token.HIDDEN_CHANNEL; + this.addPendingToken(tkn); + } + + private createAndAddPendingToken(type: number, channel: number, text: string | null, sampleToken: Token): void { + const tkn: Token = sampleToken.clone(); + tkn.type = type; + tkn.channel = channel; + tkn.stop = sampleToken.start - 1; + tkn.text = text == null ? + `<${this.getSymbolicNames()[type]}>` : + text; + + this.addPendingToken(tkn); + } + + private addPendingToken(tkn: Token): void { + // save the last pending token type because the pendingTokens linked list can be empty by the nextToken() + this.previousPendingTokenType = tkn.type; + if (tkn.channel === Token.DEFAULT_CHANNEL) { + this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType; + } + this.pendingTokens.push(tkn) /* .addLast(token) */; + } + + private getIndentationLength(indentText: string): number { // the indentText may contain spaces, tabs or form feeds + const TAB_LENGTH: number = 8; // the standard number of spaces to replace a tab to spaces + let length: number = 0; + for (let ch of indentText) { + switch (ch) { + case " ": + this.wasSpaceIndentation = true; + length += 1; + break; + case "\t": + this.wasTabIndentation = true; + length += TAB_LENGTH - (length % TAB_LENGTH); + break; + case "\f": // form feed + length = 0; + break; + } + } + + if (this.wasTabIndentation && this.wasSpaceIndentation) { + if (!this.wasIndentationMixedWithSpacesAndTabs) { + 
this.wasIndentationMixedWithSpacesAndTabs = true; + length = this.INVALID_LENGTH; // only for the first inconsistent indent + } + } + return length; + } + + private reportLexerError(errMsg: string): void { + this.getErrorListener().syntaxError(this, 0 /* this.curToken */, this.curToken!.line, this.curToken!.column, " LEXER" + this.ERR_TXT + errMsg, undefined); + } + + private reportError(errMsg: string): void { + this.reportLexerError(errMsg); + + // the ERRORTOKEN will raise an error in the parser + this.createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken!); + } +} diff --git a/python/python3_12_1/TypeScript/PythonParserBase.ts b/python/python3_12_1/TypeScript/PythonParserBase.ts new file mode 100644 index 0000000000..a21b3117af --- /dev/null +++ b/python/python3_12_1/TypeScript/PythonParserBase.ts @@ -0,0 +1,16 @@ +import { Parser, TokenStream } from "antlr4"; +//import antlr4 from "antlr4"; + +export default class PythonParserBase extends Parser { + constructor(input: TokenStream) { + super(input); + } + + isEqualToCurrentTokenText(tokenText: string): boolean { + return this.getCurrentToken().text === tokenText; + } + + isnotEqualToCurrentTokenText(tokenText: string): boolean { + return !this.isEqualToCurrentTokenText(tokenText); // for compatibility with the Python 'not' logical operator + } +} diff --git a/python/python3_12_1/changes.txt b/python/python3_12_1/changes.txt new file mode 100644 index 0000000000..b9f4d706f5 --- /dev/null +++ b/python/python3_12_1/changes.txt @@ -0,0 +1,7 @@ +Szept 05, 2024 +-------------- +Type comment tokens are no longer generated. +Type comments will now be tokenized as plain comment tokens. + +Line continuation for string literals (backslash followed by a newline) is no longer resolved. 
+(backslash+newline is no longer removed from string literals) diff --git a/python/python3_12_1/desc.xml b/python/python3_12_1/desc.xml index f6ddd173f0..8aa6fdea92 100644 --- a/python/python3_12_1/desc.xml +++ b/python/python3_12_1/desc.xml @@ -1,9 +1,9 @@ - ^4.13.1 - CSharp;Java;Python3;JavaScript + ^4.13.2 + CSharp;Java;Python3;JavaScript;TypeScript - CSharp;Java;Python3;JavaScript + CSharp;Java;Python3;JavaScript;TypeScript file_input examples diff --git a/python/python3_12_1/tests/test_double_braces_in_fstring_literal.py b/python/python3_12_1/tests/test_double_braces_in_fstring_literal.py deleted file mode 100644 index ca07eed13d..0000000000 --- a/python/python3_12_1/tests/test_double_braces_in_fstring_literal.py +++ /dev/null @@ -1,9 +0,0 @@ -# COMMAND LINE: -# grun Python file_input -tokens test_double_braces_in_fstring_literal.py -# -# EXPECTATIONS: -# - replace the double braces '{{' or '}}' to single braces: '{' or '}' -# - inserted hidden second brace token (channel=1) -# - no error message - -print(f"{{ {4*10} }}") diff --git a/python/python3_12_1/tests/test_error_first_statement_indented.py b/python/python3_12_1/tests/test_error_first_statement_indented.py index dc70cc8572..39431ac786 100644 --- a/python/python3_12_1/tests/test_error_first_statement_indented.py +++ b/python/python3_12_1/tests/test_error_first_statement_indented.py @@ -4,7 +4,7 @@ # EXPECTATIONS: # - inserted leading INDENT token # - hidden NEWLINE tokens (channel=1) before the first statement -# - lexer error message: "line 10:3 first statement indented" +# - lexer error message: "line 10:3 LEXER ERROR: first statement indented" i = 1 # first statement begins with space diff --git a/python/python3_12_1/tests/test_error_inconsistent_dedent.py b/python/python3_12_1/tests/test_error_inconsistent_dedent.py index 0a74fde76a..660f59ff65 100644 --- a/python/python3_12_1/tests/test_error_inconsistent_dedent.py +++ b/python/python3_12_1/tests/test_error_inconsistent_dedent.py @@ -3,7 +3,7 @@ # 
# EXPECTATIONS: # - inserted ERROR_TOKEN instead of the DEDENT token -# - lexer error message: "line 10:0 inconsistent dedent" +# - lexer error message: "line 10:0 LEXER ERROR: inconsistent dedent" if True: i = 0 diff --git a/python/python3_12_1/tests/test_error_tab_and_space_in_indentation.py b/python/python3_12_1/tests/test_error_tab_and_space_in_indentation.py index 493933be68..7d77a9bc0e 100644 --- a/python/python3_12_1/tests/test_error_tab_and_space_in_indentation.py +++ b/python/python3_12_1/tests/test_error_tab_and_space_in_indentation.py @@ -3,7 +3,7 @@ # # EXPECTATIONS: # - inserted ERROR_TOKEN instead of the WS token -# - lexer error message: "line 11:0 inconsistent use of tabs and spaces in indentation" +# - lexer error message: "line 11:0 LEXER ERROR: inconsistent use of tabs and spaces in indentation" if True: i = 0 # indented by spaces diff --git a/python/python3_12_1/tests/test_error_unexpected_indent.py b/python/python3_12_1/tests/test_error_unexpected_indent.py index 9d6bbd3f1f..9fca02bf5d 100644 --- a/python/python3_12_1/tests/test_error_unexpected_indent.py +++ b/python/python3_12_1/tests/test_error_unexpected_indent.py @@ -2,7 +2,7 @@ # grun Python file_input -tokens test_error_unexpected_indent.py # # EXPECTATION: -# - parser error message: "line 9:7 extraneous input '' ..." +# - parser error message: "line 9:7 mismatched input '' ..." 
if True: i = 0 diff --git a/python/python3_12_1/tests/test_explicit_line_joining.py b/python/python3_12_1/tests/test_explicit_line_joining.py index 011ee61e4b..55be1bd964 100644 --- a/python/python3_12_1/tests/test_explicit_line_joining.py +++ b/python/python3_12_1/tests/test_explicit_line_joining.py @@ -2,7 +2,7 @@ # grun Python file_input -tokens test_explicit_line_joining.py # # EXPECTATIONS: -# - hiden (channel=1) LINE_JOINING token +# - hiden (channel=1) EXPLICIT_LINE_JOINING token # - no error message i = 1 \ diff --git a/python/python3_12_1/tests/test_hidden_NEWLINE_before_comment.py b/python/python3_12_1/tests/test_hidden_NEWLINE_before_comment.py index d080bc16fb..9db3798954 100644 --- a/python/python3_12_1/tests/test_hidden_NEWLINE_before_comment.py +++ b/python/python3_12_1/tests/test_hidden_NEWLINE_before_comment.py @@ -6,6 +6,6 @@ def inc(value): # grun Python file_input -tokens test_hidden_NEWLINE_before_comment.py # # EXPECTATIONS: -# - hidden NEWLINE tokens (channel=1) before a COMMENT (or a TYPE_COMMENT) token +# - hidden NEWLINE tokens (channel=1) before a COMMENT token # - hidden NEWLINE token (channel=1) before the blank line # - no error message diff --git a/python/python3_12_1/tests/test_lambda_colon_in_fstring_literal.py b/python/python3_12_1/tests/test_lambda_colon_in_fstring_literal.py deleted file mode 100644 index dd5f492a40..0000000000 --- a/python/python3_12_1/tests/test_lambda_colon_in_fstring_literal.py +++ /dev/null @@ -1,8 +0,0 @@ -# COMMAND LINE: -# grun Python file_input -tokens test_lambda_colon_in_fstring_literal.py -# -# EXPECTATIONS: -# - the colon of the lambda expression is not a start of format specifier in the fstring literal -# - no error message - -print(f"{(lambda x: x*2)(3)}") diff --git a/python/python3_12_1/tests/test_missing format specification_in_fstring_literal.py b/python/python3_12_1/tests/test_missing format specification_in_fstring_literal.py deleted file mode 100644 index 625b3be7e1..0000000000 --- 
a/python/python3_12_1/tests/test_missing format specification_in_fstring_literal.py +++ /dev/null @@ -1,9 +0,0 @@ -# COMMAND LINE: -# grun Python file_input -tokens test_missing_format_specification_in_fstring_literal.py -# -# EXPECTATIONS: -# - inserted empty FSTRING_MIDDLE token instead of the missing format specification (after the colon) -# - no error message - -print(f"{.070:}") - diff --git a/python/python3_12_1/tests/test_string_literal_with_newline_escape_sequence.py b/python/python3_12_1/tests/test_string_literal_with_newline_escape_sequence.py deleted file mode 100644 index f14d73cb74..0000000000 --- a/python/python3_12_1/tests/test_string_literal_with_newline_escape_sequence.py +++ /dev/null @@ -1,10 +0,0 @@ -# COMMAND LINE: -# grun Python file_input -tokens test_string_literal_with_newline_escape_sequence.py -# -# EXPECTATIONS: -# - removed \ escape sequence from the STRING token -# - inserted hidden token (channel=1) with the original string literal -# - no error message - -s = 'This string will not include \ -backslashes or newline characters.' 
From cc12b69aed9af76056b0834284e047617a7d1574 Mon Sep 17 00:00:00 2001 From: Robert Einhorn Date: Sun, 15 Sep 2024 20:38:54 +0200 Subject: [PATCH 2/2] added TypeScript target and some modifications --- python/{python3_12_1 => python3_12}/CSharp/PythonLexerBase.cs | 0 python/{python3_12_1 => python3_12}/CSharp/PythonParserBase.cs | 0 python/{python3_12_1 => python3_12}/Java/PythonLexerBase.java | 0 python/{python3_12_1 => python3_12}/Java/PythonParserBase.java | 0 python/{python3_12_1 => python3_12}/JavaScript/PythonLexerBase.js | 0 .../{python3_12_1 => python3_12}/JavaScript/PythonParserBase.js | 0 python/{python3_12_1 => python3_12}/Python3/PythonLexerBase.py | 0 python/{python3_12_1 => python3_12}/Python3/PythonParserBase.py | 0 python/{python3_12_1 => python3_12}/Python3/README.md | 0 python/{python3_12_1 => python3_12}/Python3/transformGrammar.py | 0 .../Python3_12_6_official_grammar.peg | 0 python/{python3_12_1 => python3_12}/PythonLexer.g4 | 0 python/{python3_12_1 => python3_12}/PythonParser.g4 | 0 python/{python3_12_1 => python3_12}/README.md | 0 python/{python3_12_1 => python3_12}/TypeScript/PythonLexerBase.ts | 0 .../{python3_12_1 => python3_12}/TypeScript/PythonParserBase.ts | 0 python/{python3_12_1 => python3_12}/changes.txt | 0 python/{python3_12_1 => python3_12}/desc.xml | 0 python/{python3_12_1 => python3_12}/examples/__future__.py | 0 python/{python3_12_1 => python3_12}/examples/__hello__.py | 0 python/{python3_12_1 => python3_12}/examples/_aix_support.py | 0 python/{python3_12_1 => python3_12}/examples/_collections_abc.py | 0 python/{python3_12_1 => python3_12}/examples/_compat_pickle.py | 0 python/{python3_12_1 => python3_12}/examples/_compression.py | 0 python/{python3_12_1 => python3_12}/examples/_markupbase.py | 0 python/{python3_12_1 => python3_12}/examples/_osx_support.py | 0 python/{python3_12_1 => python3_12}/examples/_py_abc.py | 0 python/{python3_12_1 => python3_12}/examples/_pydatetime.py | 0 python/{python3_12_1 => 
python3_12}/examples/_pydecimal.py | 0 python/{python3_12_1 => python3_12}/examples/_pyio.py | 0 python/{python3_12_1 => python3_12}/examples/_pylong.py | 0 python/{python3_12_1 => python3_12}/examples/_sitebuiltins.py | 0 python/{python3_12_1 => python3_12}/examples/_strptime.py | 0 python/{python3_12_1 => python3_12}/examples/_threading_local.py | 0 python/{python3_12_1 => python3_12}/examples/_weakrefset.py | 0 python/{python3_12_1 => python3_12}/examples/abc.py | 0 python/{python3_12_1 => python3_12}/examples/aifc.py | 0 python/{python3_12_1 => python3_12}/examples/antigravity.py | 0 python/{python3_12_1 => python3_12}/examples/argparse.py | 0 python/{python3_12_1 => python3_12}/examples/ast.py | 0 python/{python3_12_1 => python3_12}/examples/base64.py | 0 python/{python3_12_1 => python3_12}/examples/bdb.py | 0 python/{python3_12_1 => python3_12}/examples/bisect.py | 0 python/{python3_12_1 => python3_12}/examples/bz2.py | 0 python/{python3_12_1 => python3_12}/examples/calendar.py | 0 python/{python3_12_1 => python3_12}/examples/cgi.py | 0 python/{python3_12_1 => python3_12}/examples/cgitb.py | 0 python/{python3_12_1 => python3_12}/examples/chunk.py | 0 python/{python3_12_1 => python3_12}/pom.xml | 0 python/{python3_12_1 => python3_12}/tests/test_empty_file.py | 0 .../tests/test_error_first_statement_indented.py | 0 .../tests/test_error_inconsistent_dedent.py | 0 .../{python3_12_1 => python3_12}/tests/test_error_not_indented.py | 0 .../tests/test_error_tab_and_space_in_indentation.py | 0 .../tests/test_error_unexpected_indent.py | 0 .../tests/test_explicit_line_joining.py | 0 .../tests/test_formfeed_as_separator.py | 0 .../tests/test_formfeed_at_start_of_line.py | 0 .../{python3_12_1 => python3_12}/tests/test_formfeed_in_indent.py | 0 .../tests/test_hidden_NEWLINE_before_blank_line.py | 0 .../tests/test_hidden_NEWLINE_before_comment.py | 0 .../tests/test_hidden_leading_NEWLINEs.py | 0 .../tests/test_implicit_line_joining.py | 0 
.../tests/test_insert_trailing_NEWLINE_1.py | 0 .../tests/test_insert_trailing_NEWLINE_2.py | 0 .../tests/test_no_trailing_NEWLINE.py | 0 .../tests/test_trailing_inconsistent_dedent.py | 0 python/{python3_12_1 => python3_12}/tests/test_trailing_indent.py | 0 .../tests/test_trailing_unexpected_indent.py | 0 69 files changed, 0 insertions(+), 0 deletions(-) rename python/{python3_12_1 => python3_12}/CSharp/PythonLexerBase.cs (100%) rename python/{python3_12_1 => python3_12}/CSharp/PythonParserBase.cs (100%) rename python/{python3_12_1 => python3_12}/Java/PythonLexerBase.java (100%) rename python/{python3_12_1 => python3_12}/Java/PythonParserBase.java (100%) rename python/{python3_12_1 => python3_12}/JavaScript/PythonLexerBase.js (100%) rename python/{python3_12_1 => python3_12}/JavaScript/PythonParserBase.js (100%) rename python/{python3_12_1 => python3_12}/Python3/PythonLexerBase.py (100%) rename python/{python3_12_1 => python3_12}/Python3/PythonParserBase.py (100%) rename python/{python3_12_1 => python3_12}/Python3/README.md (100%) rename python/{python3_12_1 => python3_12}/Python3/transformGrammar.py (100%) rename python/{python3_12_1 => python3_12}/Python3_12_6_official_grammar.peg (100%) rename python/{python3_12_1 => python3_12}/PythonLexer.g4 (100%) rename python/{python3_12_1 => python3_12}/PythonParser.g4 (100%) rename python/{python3_12_1 => python3_12}/README.md (100%) rename python/{python3_12_1 => python3_12}/TypeScript/PythonLexerBase.ts (100%) rename python/{python3_12_1 => python3_12}/TypeScript/PythonParserBase.ts (100%) rename python/{python3_12_1 => python3_12}/changes.txt (100%) rename python/{python3_12_1 => python3_12}/desc.xml (100%) rename python/{python3_12_1 => python3_12}/examples/__future__.py (100%) rename python/{python3_12_1 => python3_12}/examples/__hello__.py (100%) rename python/{python3_12_1 => python3_12}/examples/_aix_support.py (100%) rename python/{python3_12_1 => python3_12}/examples/_collections_abc.py (100%) rename 
python/{python3_12_1 => python3_12}/examples/_compat_pickle.py (100%) rename python/{python3_12_1 => python3_12}/examples/_compression.py (100%) rename python/{python3_12_1 => python3_12}/examples/_markupbase.py (100%) rename python/{python3_12_1 => python3_12}/examples/_osx_support.py (100%) rename python/{python3_12_1 => python3_12}/examples/_py_abc.py (100%) rename python/{python3_12_1 => python3_12}/examples/_pydatetime.py (100%) rename python/{python3_12_1 => python3_12}/examples/_pydecimal.py (100%) rename python/{python3_12_1 => python3_12}/examples/_pyio.py (100%) rename python/{python3_12_1 => python3_12}/examples/_pylong.py (100%) rename python/{python3_12_1 => python3_12}/examples/_sitebuiltins.py (100%) rename python/{python3_12_1 => python3_12}/examples/_strptime.py (100%) rename python/{python3_12_1 => python3_12}/examples/_threading_local.py (100%) rename python/{python3_12_1 => python3_12}/examples/_weakrefset.py (100%) rename python/{python3_12_1 => python3_12}/examples/abc.py (100%) rename python/{python3_12_1 => python3_12}/examples/aifc.py (100%) rename python/{python3_12_1 => python3_12}/examples/antigravity.py (100%) rename python/{python3_12_1 => python3_12}/examples/argparse.py (100%) rename python/{python3_12_1 => python3_12}/examples/ast.py (100%) rename python/{python3_12_1 => python3_12}/examples/base64.py (100%) rename python/{python3_12_1 => python3_12}/examples/bdb.py (100%) rename python/{python3_12_1 => python3_12}/examples/bisect.py (100%) rename python/{python3_12_1 => python3_12}/examples/bz2.py (100%) rename python/{python3_12_1 => python3_12}/examples/calendar.py (100%) rename python/{python3_12_1 => python3_12}/examples/cgi.py (100%) rename python/{python3_12_1 => python3_12}/examples/cgitb.py (100%) rename python/{python3_12_1 => python3_12}/examples/chunk.py (100%) rename python/{python3_12_1 => python3_12}/pom.xml (100%) rename python/{python3_12_1 => python3_12}/tests/test_empty_file.py (100%) rename python/{python3_12_1 
=> python3_12}/tests/test_error_first_statement_indented.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_error_inconsistent_dedent.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_error_not_indented.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_error_tab_and_space_in_indentation.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_error_unexpected_indent.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_explicit_line_joining.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_formfeed_as_separator.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_formfeed_at_start_of_line.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_formfeed_in_indent.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_hidden_NEWLINE_before_blank_line.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_hidden_NEWLINE_before_comment.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_hidden_leading_NEWLINEs.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_implicit_line_joining.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_insert_trailing_NEWLINE_1.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_insert_trailing_NEWLINE_2.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_no_trailing_NEWLINE.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_trailing_inconsistent_dedent.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_trailing_indent.py (100%) rename python/{python3_12_1 => python3_12}/tests/test_trailing_unexpected_indent.py (100%) diff --git a/python/python3_12_1/CSharp/PythonLexerBase.cs b/python/python3_12/CSharp/PythonLexerBase.cs similarity index 100% rename from python/python3_12_1/CSharp/PythonLexerBase.cs rename to python/python3_12/CSharp/PythonLexerBase.cs diff --git a/python/python3_12_1/CSharp/PythonParserBase.cs 
b/python/python3_12/CSharp/PythonParserBase.cs similarity index 100% rename from python/python3_12_1/CSharp/PythonParserBase.cs rename to python/python3_12/CSharp/PythonParserBase.cs diff --git a/python/python3_12_1/Java/PythonLexerBase.java b/python/python3_12/Java/PythonLexerBase.java similarity index 100% rename from python/python3_12_1/Java/PythonLexerBase.java rename to python/python3_12/Java/PythonLexerBase.java diff --git a/python/python3_12_1/Java/PythonParserBase.java b/python/python3_12/Java/PythonParserBase.java similarity index 100% rename from python/python3_12_1/Java/PythonParserBase.java rename to python/python3_12/Java/PythonParserBase.java diff --git a/python/python3_12_1/JavaScript/PythonLexerBase.js b/python/python3_12/JavaScript/PythonLexerBase.js similarity index 100% rename from python/python3_12_1/JavaScript/PythonLexerBase.js rename to python/python3_12/JavaScript/PythonLexerBase.js diff --git a/python/python3_12_1/JavaScript/PythonParserBase.js b/python/python3_12/JavaScript/PythonParserBase.js similarity index 100% rename from python/python3_12_1/JavaScript/PythonParserBase.js rename to python/python3_12/JavaScript/PythonParserBase.js diff --git a/python/python3_12_1/Python3/PythonLexerBase.py b/python/python3_12/Python3/PythonLexerBase.py similarity index 100% rename from python/python3_12_1/Python3/PythonLexerBase.py rename to python/python3_12/Python3/PythonLexerBase.py diff --git a/python/python3_12_1/Python3/PythonParserBase.py b/python/python3_12/Python3/PythonParserBase.py similarity index 100% rename from python/python3_12_1/Python3/PythonParserBase.py rename to python/python3_12/Python3/PythonParserBase.py diff --git a/python/python3_12_1/Python3/README.md b/python/python3_12/Python3/README.md similarity index 100% rename from python/python3_12_1/Python3/README.md rename to python/python3_12/Python3/README.md diff --git a/python/python3_12_1/Python3/transformGrammar.py b/python/python3_12/Python3/transformGrammar.py similarity 
index 100% rename from python/python3_12_1/Python3/transformGrammar.py rename to python/python3_12/Python3/transformGrammar.py diff --git a/python/python3_12_1/Python3_12_6_official_grammar.peg b/python/python3_12/Python3_12_6_official_grammar.peg similarity index 100% rename from python/python3_12_1/Python3_12_6_official_grammar.peg rename to python/python3_12/Python3_12_6_official_grammar.peg diff --git a/python/python3_12_1/PythonLexer.g4 b/python/python3_12/PythonLexer.g4 similarity index 100% rename from python/python3_12_1/PythonLexer.g4 rename to python/python3_12/PythonLexer.g4 diff --git a/python/python3_12_1/PythonParser.g4 b/python/python3_12/PythonParser.g4 similarity index 100% rename from python/python3_12_1/PythonParser.g4 rename to python/python3_12/PythonParser.g4 diff --git a/python/python3_12_1/README.md b/python/python3_12/README.md similarity index 100% rename from python/python3_12_1/README.md rename to python/python3_12/README.md diff --git a/python/python3_12_1/TypeScript/PythonLexerBase.ts b/python/python3_12/TypeScript/PythonLexerBase.ts similarity index 100% rename from python/python3_12_1/TypeScript/PythonLexerBase.ts rename to python/python3_12/TypeScript/PythonLexerBase.ts diff --git a/python/python3_12_1/TypeScript/PythonParserBase.ts b/python/python3_12/TypeScript/PythonParserBase.ts similarity index 100% rename from python/python3_12_1/TypeScript/PythonParserBase.ts rename to python/python3_12/TypeScript/PythonParserBase.ts diff --git a/python/python3_12_1/changes.txt b/python/python3_12/changes.txt similarity index 100% rename from python/python3_12_1/changes.txt rename to python/python3_12/changes.txt diff --git a/python/python3_12_1/desc.xml b/python/python3_12/desc.xml similarity index 100% rename from python/python3_12_1/desc.xml rename to python/python3_12/desc.xml diff --git a/python/python3_12_1/examples/__future__.py b/python/python3_12/examples/__future__.py similarity index 100% rename from 
python/python3_12_1/examples/__future__.py rename to python/python3_12/examples/__future__.py diff --git a/python/python3_12_1/examples/__hello__.py b/python/python3_12/examples/__hello__.py similarity index 100% rename from python/python3_12_1/examples/__hello__.py rename to python/python3_12/examples/__hello__.py diff --git a/python/python3_12_1/examples/_aix_support.py b/python/python3_12/examples/_aix_support.py similarity index 100% rename from python/python3_12_1/examples/_aix_support.py rename to python/python3_12/examples/_aix_support.py diff --git a/python/python3_12_1/examples/_collections_abc.py b/python/python3_12/examples/_collections_abc.py similarity index 100% rename from python/python3_12_1/examples/_collections_abc.py rename to python/python3_12/examples/_collections_abc.py diff --git a/python/python3_12_1/examples/_compat_pickle.py b/python/python3_12/examples/_compat_pickle.py similarity index 100% rename from python/python3_12_1/examples/_compat_pickle.py rename to python/python3_12/examples/_compat_pickle.py diff --git a/python/python3_12_1/examples/_compression.py b/python/python3_12/examples/_compression.py similarity index 100% rename from python/python3_12_1/examples/_compression.py rename to python/python3_12/examples/_compression.py diff --git a/python/python3_12_1/examples/_markupbase.py b/python/python3_12/examples/_markupbase.py similarity index 100% rename from python/python3_12_1/examples/_markupbase.py rename to python/python3_12/examples/_markupbase.py diff --git a/python/python3_12_1/examples/_osx_support.py b/python/python3_12/examples/_osx_support.py similarity index 100% rename from python/python3_12_1/examples/_osx_support.py rename to python/python3_12/examples/_osx_support.py diff --git a/python/python3_12_1/examples/_py_abc.py b/python/python3_12/examples/_py_abc.py similarity index 100% rename from python/python3_12_1/examples/_py_abc.py rename to python/python3_12/examples/_py_abc.py diff --git 
a/python/python3_12_1/examples/_pydatetime.py b/python/python3_12/examples/_pydatetime.py similarity index 100% rename from python/python3_12_1/examples/_pydatetime.py rename to python/python3_12/examples/_pydatetime.py diff --git a/python/python3_12_1/examples/_pydecimal.py b/python/python3_12/examples/_pydecimal.py similarity index 100% rename from python/python3_12_1/examples/_pydecimal.py rename to python/python3_12/examples/_pydecimal.py diff --git a/python/python3_12_1/examples/_pyio.py b/python/python3_12/examples/_pyio.py similarity index 100% rename from python/python3_12_1/examples/_pyio.py rename to python/python3_12/examples/_pyio.py diff --git a/python/python3_12_1/examples/_pylong.py b/python/python3_12/examples/_pylong.py similarity index 100% rename from python/python3_12_1/examples/_pylong.py rename to python/python3_12/examples/_pylong.py diff --git a/python/python3_12_1/examples/_sitebuiltins.py b/python/python3_12/examples/_sitebuiltins.py similarity index 100% rename from python/python3_12_1/examples/_sitebuiltins.py rename to python/python3_12/examples/_sitebuiltins.py diff --git a/python/python3_12_1/examples/_strptime.py b/python/python3_12/examples/_strptime.py similarity index 100% rename from python/python3_12_1/examples/_strptime.py rename to python/python3_12/examples/_strptime.py diff --git a/python/python3_12_1/examples/_threading_local.py b/python/python3_12/examples/_threading_local.py similarity index 100% rename from python/python3_12_1/examples/_threading_local.py rename to python/python3_12/examples/_threading_local.py diff --git a/python/python3_12_1/examples/_weakrefset.py b/python/python3_12/examples/_weakrefset.py similarity index 100% rename from python/python3_12_1/examples/_weakrefset.py rename to python/python3_12/examples/_weakrefset.py diff --git a/python/python3_12_1/examples/abc.py b/python/python3_12/examples/abc.py similarity index 100% rename from python/python3_12_1/examples/abc.py rename to 
python/python3_12/examples/abc.py diff --git a/python/python3_12_1/examples/aifc.py b/python/python3_12/examples/aifc.py similarity index 100% rename from python/python3_12_1/examples/aifc.py rename to python/python3_12/examples/aifc.py diff --git a/python/python3_12_1/examples/antigravity.py b/python/python3_12/examples/antigravity.py similarity index 100% rename from python/python3_12_1/examples/antigravity.py rename to python/python3_12/examples/antigravity.py diff --git a/python/python3_12_1/examples/argparse.py b/python/python3_12/examples/argparse.py similarity index 100% rename from python/python3_12_1/examples/argparse.py rename to python/python3_12/examples/argparse.py diff --git a/python/python3_12_1/examples/ast.py b/python/python3_12/examples/ast.py similarity index 100% rename from python/python3_12_1/examples/ast.py rename to python/python3_12/examples/ast.py diff --git a/python/python3_12_1/examples/base64.py b/python/python3_12/examples/base64.py similarity index 100% rename from python/python3_12_1/examples/base64.py rename to python/python3_12/examples/base64.py diff --git a/python/python3_12_1/examples/bdb.py b/python/python3_12/examples/bdb.py similarity index 100% rename from python/python3_12_1/examples/bdb.py rename to python/python3_12/examples/bdb.py diff --git a/python/python3_12_1/examples/bisect.py b/python/python3_12/examples/bisect.py similarity index 100% rename from python/python3_12_1/examples/bisect.py rename to python/python3_12/examples/bisect.py diff --git a/python/python3_12_1/examples/bz2.py b/python/python3_12/examples/bz2.py similarity index 100% rename from python/python3_12_1/examples/bz2.py rename to python/python3_12/examples/bz2.py diff --git a/python/python3_12_1/examples/calendar.py b/python/python3_12/examples/calendar.py similarity index 100% rename from python/python3_12_1/examples/calendar.py rename to python/python3_12/examples/calendar.py diff --git a/python/python3_12_1/examples/cgi.py 
b/python/python3_12/examples/cgi.py similarity index 100% rename from python/python3_12_1/examples/cgi.py rename to python/python3_12/examples/cgi.py diff --git a/python/python3_12_1/examples/cgitb.py b/python/python3_12/examples/cgitb.py similarity index 100% rename from python/python3_12_1/examples/cgitb.py rename to python/python3_12/examples/cgitb.py diff --git a/python/python3_12_1/examples/chunk.py b/python/python3_12/examples/chunk.py similarity index 100% rename from python/python3_12_1/examples/chunk.py rename to python/python3_12/examples/chunk.py diff --git a/python/python3_12_1/pom.xml b/python/python3_12/pom.xml similarity index 100% rename from python/python3_12_1/pom.xml rename to python/python3_12/pom.xml diff --git a/python/python3_12_1/tests/test_empty_file.py b/python/python3_12/tests/test_empty_file.py similarity index 100% rename from python/python3_12_1/tests/test_empty_file.py rename to python/python3_12/tests/test_empty_file.py diff --git a/python/python3_12_1/tests/test_error_first_statement_indented.py b/python/python3_12/tests/test_error_first_statement_indented.py similarity index 100% rename from python/python3_12_1/tests/test_error_first_statement_indented.py rename to python/python3_12/tests/test_error_first_statement_indented.py diff --git a/python/python3_12_1/tests/test_error_inconsistent_dedent.py b/python/python3_12/tests/test_error_inconsistent_dedent.py similarity index 100% rename from python/python3_12_1/tests/test_error_inconsistent_dedent.py rename to python/python3_12/tests/test_error_inconsistent_dedent.py diff --git a/python/python3_12_1/tests/test_error_not_indented.py b/python/python3_12/tests/test_error_not_indented.py similarity index 100% rename from python/python3_12_1/tests/test_error_not_indented.py rename to python/python3_12/tests/test_error_not_indented.py diff --git a/python/python3_12_1/tests/test_error_tab_and_space_in_indentation.py b/python/python3_12/tests/test_error_tab_and_space_in_indentation.py 
similarity index 100% rename from python/python3_12_1/tests/test_error_tab_and_space_in_indentation.py rename to python/python3_12/tests/test_error_tab_and_space_in_indentation.py diff --git a/python/python3_12_1/tests/test_error_unexpected_indent.py b/python/python3_12/tests/test_error_unexpected_indent.py similarity index 100% rename from python/python3_12_1/tests/test_error_unexpected_indent.py rename to python/python3_12/tests/test_error_unexpected_indent.py diff --git a/python/python3_12_1/tests/test_explicit_line_joining.py b/python/python3_12/tests/test_explicit_line_joining.py similarity index 100% rename from python/python3_12_1/tests/test_explicit_line_joining.py rename to python/python3_12/tests/test_explicit_line_joining.py diff --git a/python/python3_12_1/tests/test_formfeed_as_separator.py b/python/python3_12/tests/test_formfeed_as_separator.py similarity index 100% rename from python/python3_12_1/tests/test_formfeed_as_separator.py rename to python/python3_12/tests/test_formfeed_as_separator.py diff --git a/python/python3_12_1/tests/test_formfeed_at_start_of_line.py b/python/python3_12/tests/test_formfeed_at_start_of_line.py similarity index 100% rename from python/python3_12_1/tests/test_formfeed_at_start_of_line.py rename to python/python3_12/tests/test_formfeed_at_start_of_line.py diff --git a/python/python3_12_1/tests/test_formfeed_in_indent.py b/python/python3_12/tests/test_formfeed_in_indent.py similarity index 100% rename from python/python3_12_1/tests/test_formfeed_in_indent.py rename to python/python3_12/tests/test_formfeed_in_indent.py diff --git a/python/python3_12_1/tests/test_hidden_NEWLINE_before_blank_line.py b/python/python3_12/tests/test_hidden_NEWLINE_before_blank_line.py similarity index 100% rename from python/python3_12_1/tests/test_hidden_NEWLINE_before_blank_line.py rename to python/python3_12/tests/test_hidden_NEWLINE_before_blank_line.py diff --git a/python/python3_12_1/tests/test_hidden_NEWLINE_before_comment.py 
b/python/python3_12/tests/test_hidden_NEWLINE_before_comment.py similarity index 100% rename from python/python3_12_1/tests/test_hidden_NEWLINE_before_comment.py rename to python/python3_12/tests/test_hidden_NEWLINE_before_comment.py diff --git a/python/python3_12_1/tests/test_hidden_leading_NEWLINEs.py b/python/python3_12/tests/test_hidden_leading_NEWLINEs.py similarity index 100% rename from python/python3_12_1/tests/test_hidden_leading_NEWLINEs.py rename to python/python3_12/tests/test_hidden_leading_NEWLINEs.py diff --git a/python/python3_12_1/tests/test_implicit_line_joining.py b/python/python3_12/tests/test_implicit_line_joining.py similarity index 100% rename from python/python3_12_1/tests/test_implicit_line_joining.py rename to python/python3_12/tests/test_implicit_line_joining.py diff --git a/python/python3_12_1/tests/test_insert_trailing_NEWLINE_1.py b/python/python3_12/tests/test_insert_trailing_NEWLINE_1.py similarity index 100% rename from python/python3_12_1/tests/test_insert_trailing_NEWLINE_1.py rename to python/python3_12/tests/test_insert_trailing_NEWLINE_1.py diff --git a/python/python3_12_1/tests/test_insert_trailing_NEWLINE_2.py b/python/python3_12/tests/test_insert_trailing_NEWLINE_2.py similarity index 100% rename from python/python3_12_1/tests/test_insert_trailing_NEWLINE_2.py rename to python/python3_12/tests/test_insert_trailing_NEWLINE_2.py diff --git a/python/python3_12_1/tests/test_no_trailing_NEWLINE.py b/python/python3_12/tests/test_no_trailing_NEWLINE.py similarity index 100% rename from python/python3_12_1/tests/test_no_trailing_NEWLINE.py rename to python/python3_12/tests/test_no_trailing_NEWLINE.py diff --git a/python/python3_12_1/tests/test_trailing_inconsistent_dedent.py b/python/python3_12/tests/test_trailing_inconsistent_dedent.py similarity index 100% rename from python/python3_12_1/tests/test_trailing_inconsistent_dedent.py rename to python/python3_12/tests/test_trailing_inconsistent_dedent.py diff --git 
a/python/python3_12_1/tests/test_trailing_indent.py b/python/python3_12/tests/test_trailing_indent.py similarity index 100% rename from python/python3_12_1/tests/test_trailing_indent.py rename to python/python3_12/tests/test_trailing_indent.py diff --git a/python/python3_12_1/tests/test_trailing_unexpected_indent.py b/python/python3_12/tests/test_trailing_unexpected_indent.py similarity index 100% rename from python/python3_12_1/tests/test_trailing_unexpected_indent.py rename to python/python3_12/tests/test_trailing_unexpected_indent.py