Re: getting the thing to be LL(1)
- From: Rick Barton <rbarton@xxxxxxxxxxxxx>
- Date: Thu, 27 Oct 2005 05:42:26 GMT
I don't have your answer, but... did you know there is both global and localized LOOKAHEAD? Keep the global setting as low as possible, and fine-tune the trouble spots.
Robert Watkins wrote:
I have been wrangling with this one for a few days now, and can't for the life of me find the right solution -- if indeed there is one! Any guidance would be appreciated.
I am attempting to write a query parser and to keep it LL(1). The main problem I'm having is with the NEAR operator: it needs to be able to accept prenthetic expressions such as
aaa NEAR (bbb PHRASE ccc*)
but it cannot accept any operands that are not simple terms (i.e. WORD) or PHRASE expressions, and any field specifications must be the same. For example, the following are not allowed:
author:(a next b*) near author:(c and body:d) author:fred near author:(a and b) title:wine near keyword:tobacco
The following grammar works, but requires a LOOKAHEAD(3), which I would like to avoid if at all possible.
I have tried dividing fieldExpr() into fieldPhraseExpr() ( field = field() )? ( phraseExpr() | <LPAREN> phraseExpr() <RPAREN> ) and fieldOtherExpr() ( field = field() )? ( phraseExpr() | <LPAREN> expr() <RPAREN> ) and using only fieldPhraseExpr() in nearExpr() but then unaryExpr() becomes qref() { qRefCount++; } | nearExpr() | fieldOtherExpr() and requires a lookahead greater than 3 (I think I got it down to 4).
There are other variations that I've tried, but this posting is rather long anyway, so I'll not enumerate. I'm probably missing something basic and cannot now see the forest for the proverbial trees.
Thank you to anyone taking the time to look at this, -- Robert
--------------------------- Robert Watkins rwatkins at foo-bar dot org ---------------------------
--------------------------------------
// QueryParser.jjt // $Id$
options { //DEBUG_LOOKAHEAD = true; //DEBUG_TOKEN_MANAGER = true; NODE_DEFAULT_VOID = true; NODE_PREFIX = ""; STATIC = false; USER_CHAR_STREAM = false; USER_TOKEN_MANAGER = false; }
PARSER_BEGIN(QueryParser)
package com.wiley.interscience.search.queryparser;
public class QueryParser {
private int nearNumDefault = 3; // gota have some default private int qRefCount = 0;
public QueryParser(java.io.Reader stream, int nearNumDefault) { this(stream); this.nearNumDefault = nearNumDefault; }
public int getNearNumDefault() { return nearNumDefault; }
protected int qRefCount() { return qRefCount; }
}
PARSER_END(QueryParser)
/* Token definitions */
<*> TOKEN : { <#_WHITESPACE: ( " " | "\t" ) >
/* --- unicode letters --- // requires UNICODE_INPUT = true; in options
// ASCII letters
"\u0041"-"\u005a", // A-Z
"\u0061"-"\u007a", // a-z
// Latin-1 letters (w/out multiplication "\u00d7" or division "\u00f7" symbols)
"\u00c0"-"\u00d6", // À-Ö
"\u00d8"-"\u00f6", // Ø-ö
"\u00f8"-"\u00ff", // ø-ÿ
// Latin Extended-A and Latin Extended-B
"\u0100"-"\u1fff"
*/
| <#_LETTER: ["A"-"Z", "a"-"z", "À"-"Ö", "Ø"-"ö", "ø"-"ÿ"] >
| <#_ALPHA: (<_LETTER>)+ >
| <#_DIGIT: ["0"-"9"] >
| <#_NUM_GT_ZERO: ["1"-"9"] (<_DIGIT>)* >
| <#_HAS_DIGIT: (<_LETTER>|<_DIGIT>)* <_DIGIT> (<_LETTER>|<_DIGIT>)* > // at least one digit
| <#_P: ("_"|"-"|"/"|"."|",") >
// basic word: a sequence of digits & letters | <#_ALPHANUM: (<_LETTER>|<_DIGIT>)+ >
// internal apostrophes: O'Reilly, you're, O'Reilly's // use a post-filter to remove possesives | <#_APOSTROPHE: <_ALPHA> ("'" <_ALPHA>)+ >
// acronyms: U.S.A., I.B.M., etc.
// use a post-filter to remove dots
| <#_ACRONYM: <_ALPHA> "." (<_ALPHA> ".")+ >
// company names like AT&T and Excite@Home. | <#_COMPANY: <_ALPHA> ("&"|"@") <_ALPHA> >
// email addresses
| <#_EMAIL: <_ALPHANUM> (("."|"-"|"_") <_ALPHANUM>)* "@" <_ALPHANUM> (("."|"-") <_ALPHANUM>)+ >
// hostname | <#_HOST: <_ALPHANUM> ("." <_ALPHANUM>)+ >
// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
| <#_NUM: (
<_ALPHANUM> <_P> <_HAS_DIGIT>
| <_HAS_DIGIT> <_P> <_ALPHANUM>
| <_ALPHANUM> ( <_P> <_HAS_DIGIT> <_P> <_ALPHANUM> )+
| <_HAS_DIGIT> ( <_P> <_ALPHANUM> <_P> <_HAS_DIGIT> )+
| <_ALPHANUM> <_P> <_HAS_DIGIT> ( <_P> <_ALPHANUM> <_P> <_HAS_DIGIT> )+
| <_HAS_DIGIT> <_P> <_ALPHANUM> ( <_P> <_HAS_DIGIT> <_P> <_ALPHANUM> )+
) >
| <#_QUOTED: "\"" (~["\""])+ "\"" >
| <#_ESCAPED_CHAR: "\\" [ "\\", "+", "-", "!", "(", ")", ":", "^", "[", "]", "\"", "{", "}", "~", "*", "?" ] > | <#_TERM_START_CHAR: ( ~[ " ", "\t", "+", "-", "!", "(", ")", ":", "^", "[", "]", "\"", "{", "}", "~", "*", "?" ] | <_ESCAPED_CHAR> ) >
| <#_TERM_CHAR: ( <_TERM_START_CHAR> | <_ESCAPED_CHAR> ) >
| <#_TERM: <_TERM_START_CHAR> (<_TERM_CHAR>)* >
| <#_PREFIXTERM: <_TERM_START_CHAR> (<_TERM_CHAR>)* "*" >
| <#_WILDTERM: <_TERM_START_CHAR> (<_TERM_CHAR> | ( [ "*", "?" ] ))*
| <#_FIELD: (<_ALPHANUM>)+ ("-"(<_ALPHANUM>)+)* ":" > // article-title:
}
SKIP : { <<_WHITESPACE>> }
TOKEN : { <LPAREN: "(" > | <RPAREN: ")" > }
TOKEN [IGNORE_CASE] : { <AND: "AND" > | <OR: "OR" > | <NOT: "NOT" > | <NEAR: "NEAR" ( ("/")? <_NUM_GT_ZERO> )? > | <PHRASE: ( "PHRASE" | "NEXT" | ( "NEAR" ("/")? "0" ) ) > }
TOKEN : { <QREF: "#" <_NUM_GT_ZERO> > }
TOKEN : { <WORD: ( <_ALPHANUM> | <_APOSTROPHE> | <_ACRONYM> | <_COMPANY> | <_EMAIL> | <_HOST> | <_NUM> | <_QUOTED> | <_TERM> | <_PREFIXTERM> | <_WILDTERM> )+ > }
TOKEN : { <FIELD: (<_ALPHANUM>)+ ("-"(<_ALPHANUM>)+)* ":" > // article-title: }
/* actions */
SimpleNode parse() #ROOT : {} { expr() <EOF> { return jjtThis; } }
void expr() : {} { orExpr() }
void orExpr() : {} { ( andExpr() ( <OR> andExpr() )* { jjtThis.setName("OR"); } ) #OR(>1) }
void andExpr() : {} { ( unaryExpr() ( ( <AND> )? ( unaryExpr() | notExpr() ) )* { jjtThis.setName("AND"); } ) #AND(>1) }
void unaryExpr() : {} { qref() { qRefCount++; } | nearExpr() }
void notExpr() : {} { ( <NOT> unaryExpr() { jjtThis.setName("NOT"); } ) #NOT }
void nearExpr() :
{
String nearOp = null;
//String fieldA = "";
//String fieldB = "";
String[] fpA;
String[] fpB;
}
{
(
fpA = fieldExpr() (
nearOp = nearOp() fpB = fieldExpr()
{
if (!fpB[0].equals(fpA[0])) {
throw new ParseException("NEAR expression requires fields to be identical");
}
else if (!(fpA[1].equals("true") && fpB[1].equals("true"))) {
throw new ParseException("NEAR expression requires all operands to be PHRASE expressions");
}
}
)*
{
if (nearOp != null) { jjtThis.setName(nearOp); }
}
) #NEAR(>1)
}
String[] fieldExpr() : { String field = ""; String phrase = "false"; String[] fp = new String[2]; } { ( //( field = field() )? ( phraseExpr() | <LPAREN> expr() <RPAREN> ) ( field = field() )? ( phraseExpr() { phrase = "true"; } | LOOKAHEAD(3) <LPAREN> phraseExpr() <RPAREN> { phrase = "true"; } | <LPAREN> expr() <RPAREN> ) { fp[0] = field; fp[1] = phrase; return fp; //return field; } ) }
void phraseExpr() : { String phraseOp = null; } { ( word() ( phraseOp = phraseOp() word() )* { if (phraseOp != null) { jjtThis.setName(phraseOp); } } ) #PHRASE(>1) }
String nearOp() :
{ Token t; }
{
( t = <NEAR> )
{
StringBuffer tokenBuf = new StringBuffer();
if (t.image.equals("NEAR") || t.image.equals("near")) {
tokenBuf.append("NEAR/").append(nearNumDefault);
t.image = tokenBuf.toString();
}
else {
try {
int nearNumPos = (t.image.indexOf("/") == 4) ? 5 : 4; // after "NEAR/" or after "NEAR"
String nearNum = t.image.substring(nearNumPos);
int nearInt = Integer.parseInt(nearNum);
tokenBuf.append("NEAR/").append(nearNum);
t.image = tokenBuf.toString();
}
catch (NumberFormatException e) {
// can only happen if nearInt is too large to besome an int
throw new ParseException(token.image + " is not a proper NEAR modifier");
}
}
return t.image;
}
}
String phraseOp() :
{ Token t; }
{
( t = <PHRASE> )
{
if (t.image.equals("NEXT") || t.image.equals("next")) {
t.image = "PHRASE";
}
else if (t.image.equals("PHRASE")) {
// do nothing: all is well
}
else {
try {
int nextNumPos = (t.image.indexOf("/") == 4) ? 5 : 4; // after "next/" or after "next"
String nextNum = t.image.substring(nextNumPos);
int nextInt = Integer.parseInt(nextNum);
if (nextInt == 0) {
t.image = "PHRASE";
}
else {
throw new ParseException(nextNum + " is not a proper PHRASE modifier");;
}
}
catch (NumberFormatException e) {
// can only happen if nextInt is too large to besome an int
throw new ParseException(token.image + " is not a proper PHRASE modifier");
}
}
return t.image;
}
}
void qref() #QREF : { Token t; } { ( t = <QREF> ) { jjtThis.setName(t.image); } }
void word() #WORD : { Token t; } { ( t=<WORD> ) { jjtThis.setName(t.image); } }
String field() #FIELD : { Token t; } { ( t=<FIELD> ) { jjtThis.setName(t.image); return t.image.substring(0, (t.image.length() - 1)); } }
.
- References:
- getting the thing to be LL(1)
- From: Robert Watkins
- getting the thing to be LL(1)
- Prev by Date: Re: Parsing mulitple files
- Previous by thread: getting the thing to be LL(1)
- Next by thread: for loop Parsing
- Index(es):
Relevant Pages
|
Loading