Sunday, November 26, 2006

Multiline strings in java - Part 2

After a little more messing around with Scanner.java in the
compiler source code, I now have string handling similar to
Python's. """ denotes a multiline string, the code normalizes
end-of-line characters (CRLF -> LF) (CR -> LF) (a bit like XML
handling), and raw strings are added. r" or r""" denotes a raw string
with unicode escape handling (equivalent to Python's ur", ur""")
and R" or R""" denotes a raw string without unicode handling.

Raw strings are useful for regular expressions and
for evaluating script strings.

The grammar is something like: (copied in part from python grammar)

stringliteral ::= [stringprefix](shortstring | longstring)
stringprefix ::= "r" | "R"
shortstring ::= '"' shortstringitem* '"'
longstring ::= '"""' longstringitem* '"""'
shortstringitem ::= shortstringchar | escapeseq
longstringitem ::= longstringchar | escapeseq
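shortstringchar ::= any source character except "\" or newline or the quote
longstringchar ::= any source character except "\"
escapeseq ::= "\" any ASCII character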

So the following now compiles and runs:

import javax.script.*;

/**
 * Demonstrates the multiline and raw string literals described in this post.
 * It relies on the patched compiler: the r""" and R" literal prefixes are
 * not standard Java syntax.
 */
public class ScriptExample {
/**
 * Evaluates a small JavaScript snippet held in a raw triple-quoted string,
 * then prints a raw string that contains a bare backslash-u.
 *
 * @param args command-line arguments (unused)
 * @throws Exception declared so that failures from the script engine propagate
 */
public static void main(String[] args) throws Exception {
ScriptEngineManager m = new ScriptEngineManager();
ScriptEngine engine = m.getEngineByName("javascript");
// r""" opens a raw multiline string (unicode escapes still handled, per the
// post), so the backslash-n below is passed to the script engine as written.
engine.eval(r"""
print("This is the world ");
print("calling\n");
""");
// R" opens a raw string without unicode escape handling, so the
// backslash-u is kept verbatim (see the "No escape" line in the output).
String x = R"\u";
System.out.println("No escape " + x);
}
}

/f/src/compiler > java -jar dist/lib/javac.jar ScriptExample.java
/f/src/compiler > java -cp . ScriptExample
This is the world calling
No escape \u


The code is *NOT* fully tested .... but
here is the diff file (diff -Nua) from Scanner.java.orig to Scanner.java

--- Scanner.java.orig Sun Nov 26 12:57:25 2006
+++ Scanner.java Sun Nov 26 21:49:59 2006
@@ -52,6 +52,13 @@
@Version("@(#)Scanner.java 1.73 06/11/11")
public class Scanner implements Lexer {

+ /** Enumerated type on how to handle decoding \ in string */
+ private enum EscapeHandling {
+ FULL, // decode all \
+ UNICODE, // only decode unicode escape
+ RAW, // completely raw
+ }
+
private static boolean scannerDebug = false;

/** A factory for creating scanners. */
@@ -291,6 +298,101 @@
convertUnicode();
}
}
+ /** Read next character.
+ * @param escapeHandling how to handle \ in strings
+ */
+ private void scanChar(EscapeHandling escapeHandling) {
+ ch = buf[++bp];
+ if (escapeHandling != EscapeHandling.RAW && ch == '\\') {
+ convertUnicode();
+ }
+ }
+
+ /**
+ * Look ahead for a character.
+ * @param count the number to look ahead by.
+ * @return the character of the future... or (char) 0 for
+ * characters beyond the buffer.
+ */
+ private char peekAhead(int count) {
+ if (count + bp >= buflen) {
+ return (char) 0;
+ }
+ return buf[count + bp];
+ }
+
+ /** Check if we are at a triple double quote.
+ * @return true if the next two characters are quotes.
+ */
+ private boolean atTripleQuote() {
+ return peekAhead(1) == '"' && peekAhead(2) == '"';
+ }
+
+ /** Scan a triple quoted string.
+ * @param escapeHandling how to handle \ in strings
+ */
+ private void scanTripleQuotedString(EscapeHandling escapeHandling) {
+ // read in the """ characters
+ scanChar();
+ scanChar();
+ scanChar(escapeHandling);
+ while (true) {
+ while (ch != '"' && ch != CR && bp < buflen) {
+ scanLitChar(escapeHandling);
+ }
+ if (bp >= buflen) {
+ lexError(pos, "unclosed.str.lit");
+ return;
+ }
+ if (ch == CR) {
+ if (peekAhead(1) == LF) {
+ scanChar();
+ continue; // Normalize CRLF to LF
+ }
+ ch = LF; // Normalize CR to LF
+ continue;
+ }
+ // Character is a ", is the the triple?
+ if (atTripleQuote()) {
+ // yes - end of the string
+ scanChar();
+ scanChar();
+ scanChar();
+ token = STRINGLITERAL;
+ return;
+ } else {
+ // no - add it to the string.
+ scanLitChar(escapeHandling);
+ }
+ }
+ }
+
+ /** Scan a quoted string.
+ * @param escapeHandling how to handle \ in strings
+ */
+ private void scanQuotedString(EscapeHandling escapeHandling) {
+ scanChar(escapeHandling);
+ while (ch != '"' && ch != CR && ch != LF && bp < buflen) {
+ scanLitChar(escapeHandling);
+ }
+ if (ch == '"') {
+ token = STRINGLITERAL;
+ scanChar();
+ } else {
+ lexError(pos, "unclosed.str.lit");
+ }
+ }
+
+ /** Scan a string.
+ * @param escapeHandling how to handle \ in strings
+ */
+ private void scanString(EscapeHandling escapeHandling) {
+ if (atTripleQuote()) {
+ scanTripleQuotedString(escapeHandling);
+ } else {
+ scanQuotedString(escapeHandling);
+ }
+ }

/** Read next character in comment, skipping over double '\' characters.
*/
@@ -323,9 +425,10 @@
}

/** Read next character in character or string literal and copy into sbuf.
+ * @param escapeHandling how to handle \ in strings
*/
- private void scanLitChar() {
- if (ch == '\\') {
+ private void scanLitChar(EscapeHandling escapeHandling) {
+ if (escapeHandling == EscapeHandling.FULL && ch == '\\') {
if (buf[bp+1] == '\\' && unicodeConversionBp != bp) {
bp++;
putChar('\\');
@@ -369,7 +472,8 @@
}
}
} else if (bp != buflen) {
- putChar(ch); scanChar();
+ putChar(ch);
+ scanChar(escapeHandling);
}
}

@@ -779,16 +883,30 @@
endPos = bp;
processLineTerminator();
break;
+ case 'r':
+ if ('"' == peekAhead(1)) {
+ scanChar();
+ scanString(EscapeHandling.UNICODE);
+ return;
+ }
+ // Fall Tru
+ case 'R':
+ if ('"' == peekAhead(1)) {
+ scanChar();
+ scanString(EscapeHandling.RAW);
+ return;
+ }
+ // FALL TRU
case 'A': case 'B': case 'C': case 'D': case 'E':
case 'F': case 'G': case 'H': case 'I': case 'J':
case 'K': case 'L': case 'M': case 'N': case 'O':
- case 'P': case 'Q': case 'R': case 'S': case 'T':
+ case 'P': case 'Q': case 'S': case 'T':
case 'U': case 'V': case 'W': case 'X': case 'Y':
case 'Z':
case 'a': case 'b': case 'c': case 'd': case 'e':
case 'f': case 'g': case 'h': case 'i': case 'j':
case 'k': case 'l': case 'm': case 'n': case 'o':
- case 'p': case 'q': case 'r': case 's': case 't':
+ case 'p': case 'q': case 's': case 't':
case 'u': case 'v': case 'w': case 'x': case 'y':
case 'z':
case '$': case '_':
@@ -902,7 +1020,7 @@
} else {
if (ch == CR || ch == LF)
lexError(pos, "illegal.line.end.in.char.lit");
- scanLitChar();
+ scanLitChar(EscapeHandling.FULL);
if (ch == '\'') {
scanChar();
token = CHARLITERAL;
@@ -911,17 +1029,9 @@
}
}
return;
- case '\"':
- scanChar();
- while (ch != '\"' && ch != CR && ch != LF && bp < buflen)
- scanLitChar();
- if (ch == '\"') {
- token = STRINGLITERAL;
- scanChar();
- } else {
- lexError(pos, "unclosed.str.lit");
- }
- return;
+ case '"':
+ scanString(EscapeHandling.FULL);
+ return;
default:
if (isSpecial(ch)) {
scanOperator();

No comments: