antlr-代码分析(2)-Lexer&IntStream

Recognizer中的疑惑

在阅读Recognizer时,出现了很多新的概念
ATNInterpreter
ANTLRErrorListener
ATN
ParseInfo

Lexer

同时实现了Recognizer接口和TokenSource接口,提供了一系列的tokens源
TokenSource接口
代码如下

/**
 * A source of tokens: hands out Token objects read from an input stream and
 * reports where in that input it currently is.
 */
public interface TokenSource {
	/** Returns a Token object from the input stream. */
	Token nextToken();

	/** Returns the current line number of the input stream. */
	int getLine();

	/** Returns the position within the current line. */
	int getCharPositionInLine();

	/** Returns the CharStream from which the tokens are obtained. */
	CharStream getInputStream();

	/** Returns the name of the input source. */
	String getSourceName();

	/** Sets the TokenFactory. */
	void setTokenFactory(TokenFactory<?> factory);

	/** Returns the TokenFactory, which is used to construct objects from the input stream. */
	TokenFactory<?> getTokenFactory();
}

Lexer代码如下

/**
 * Lexer: a Recognizer that also implements TokenSource, i.e. it supplies a
 * stream of tokens built from a CharStream.
 *
 * This is an excerpt; the bare names and the "..." inside the body stand for
 * members whose code is not reproduced here.
 */
public abstract class Lexer extends Recognizer<Integer, LexerATNSimulator>
	implements TokenSource {
	public static final int DEFAULT_MODE = 0;
	public static final int MORE = -2;
	public static final int SKIP = -3;
	
	/** Same value as Token.DEFAULT_CHANNEL. */
	public static final int DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL;	// 0
	/** Same value as Token.HIDDEN_CHANNEL. */
	public static final int HIDDEN = Token.HIDDEN_CHANNEL;		// 1
	public static final int MIN_CHAR_VALUE = '\u0000';	
	public static final int MAX_CHAR_VALUE = '\uFFFE';
	
	/** Character stream this lexer reads from; set by the CharStream constructor. */
	public CharStream _input;
	/** (this lexer, input) pair that is handed to the token factory when creating tokens. */
	protected Pair<TokenSource, CharStream> _tokenFactorySourcePair;
	/** Factory used by emit()/emitEOF() to create Token objects. */
	protected TokenFactory<?> _factory = CommonTokenFactory.DEFAULT;
	public Token _token;
	/** Start char index passed to the factory by emit(); -1 initially. */
	public int _tokenStartCharIndex = -1;
	/** Line passed to the factory by emit() and reported by notifyListeners(). */
	public int _tokenStartLine;
	/** Column passed to the factory by emit() and reported by notifyListeners(). */
	public int _tokenStartCharPositionInLine;
	public boolean _hitEOF;
	/** Channel passed to the factory by emit(). */
	public int _channel;
	/** Token type passed to the factory by emit(). */
	public int _type;
	public final IntegerStack _modeStack = new IntegerStack();
	/** Starts out as DEFAULT_MODE. */
	public int _mode = Lexer.DEFAULT_MODE;
	/** Text passed to the factory by emit(). */
	public String _text;

	/** Creates a lexer without attaching an input stream. */
	public Lexer() { }

	/**
	 * Creates a lexer that reads from the given CharStream and pairs itself
	 * with that stream for later token creation.
	 *
	 * @param input the character stream to read from
	 */
	public Lexer(CharStream input) {
		this._input = input;
		this._tokenFactorySourcePair = new Pair<TokenSource, CharStream>(this, input);
	}
	
	reset
	nextToken	
	emit
	emitEOF	

	/**
	 * Reports a token recognition error to the error-listener dispatcher.
	 * The message quotes the input text from the token start index up to the
	 * current input index.
	 *
	 * @param e the exception describing the failed match
	 */
	public void notifyListeners(LexerNoViableAltException e) {
		String text = _input.getText(Interval.of(_tokenStartCharIndex, _input.index()));
		String msg = "token recognition error at: '"+ getErrorDisplay(text) + "'";

		ANTLRErrorListener listener = getErrorListenerDispatch();
		listener.syntaxError(this, null, _tokenStartLine, _tokenStartCharPositionInLine, msg, e);
	}

	/**
	 * Recovers from a recognition error by consuming one symbol of input.
	 *
	 * @param re the error being recovered from (not inspected here)
	 */
	public void recover(RecognitionException re) {
		_input.consume();
	}

	/**
	 * Creates a token from the current type, text, channel and start position,
	 * ending at getCharIndex()-1, passes it to emit(Token) and returns it.
	 *
	 * @return the newly created token
	 */
	public Token emit() {
		Token t = _factory.create(_tokenFactorySourcePair, _type, _text, _channel, _tokenStartCharIndex, getCharIndex()-1,
								  _tokenStartLine, _tokenStartCharPositionInLine);
		emit(t);
		return t;
	}

	/**
	 * Creates an EOF token on the default channel at the current line/column.
	 * Its start index is the current input index and its stop index is one
	 * less than that. The token is passed to emit(Token) and returned.
	 *
	 * @return the newly created EOF token
	 */
	public Token emitEOF() {
		int cpos = getCharPositionInLine();
		int line = getLine();
		Token eof = _factory.create(_tokenFactorySourcePair, Token.EOF, null, Token.DEFAULT_CHANNEL, _input.index(), _input.index()-1,
									line, cpos);
		emit(eof);
		return eof;
	}
	...
}

Token

Token具有的内容:text, type, line, 字符位置, 从哪个源获取这个token

/**
 * A token carries: text, type, line, character position, and the source it
 * was obtained from.
 */
public interface Token {
	int INVALID_TYPE = 0;
	int EPSILON = -2;
	int MIN_USER_TOKEN_TYPE = 1;
	/** Same value as IntStream.EOF. */
	int EOF = IntStream.EOF;

	int DEFAULT_CHANNEL = 0;
	int HIDDEN_CHANNEL = 1;
	int MIN_USER_CHANNEL_VALUE = 2;

	/** Returns the text of the token. */
	String getText();

	/** Returns the type of the token. */
	int getType();

	/** Returns the line of the token. */
	int getLine();

	/** Returns the character position of the token within its line. */
	int getCharPositionInLine();

	int getChannel();

	int getTokenIndex();

	int getStartIndex();

	int getStopIndex();

	/** Returns the TokenSource this token was obtained from. */
	TokenSource getTokenSource();

	CharStream getInputStream();
}

WritableToken

/**
 * A Token that additionally exposes setters for text, type, line, column,
 * channel and token index.
 */
public interface WritableToken extends Token {
	void setText(String text);

	void setType(int ttype);

	void setLine(int line);

	void setCharPositionInLine(int pos);

	void setChannel(int channel);

	void setTokenIndex(int index);
}

CommonToken
TokenSource和CharStream是一一对应的关系

/**
 * WritableToken implementation that stores its fields directly and keeps the
 * (TokenSource, CharStream) pair it came from.
 * This is an excerpt; the getters/setters are elided by the placeholder line
 * near the end of the class.
 */
public class CommonToken implements WritableToken, Serializable {
	// EMPTY_SOURCE: a pair whose TokenSource and CharStream are both null
	protected static final Pair<TokenSource, CharStream> EMPTY_SOURCE = 
									new Pair<TokenSource, CharStream>(null, null);
	protected int type;
	protected int line;
	protected int charPositionInLine = -1;
	protected int channel=DEFAULT_CHANNEL;
	protected Pair<TokenSource, CharStream> source;
	// explicit text; when null, getText() reads the text from the input stream instead
	protected String text;
	protected int index = -1;
	protected int start;
	protected int stop;

	// Sets only the type; source is left as EMPTY_SOURCE
	public CommonToken(int type) {
		this.type = type;
		this.source = EMPTY_SOURCE;
	}
	// Sets type, channel, start/stop and the source pair
	public CommonToken(Pair<TokenSource, CharStream> source, int type, int channel, int start, int stop) {
		this.source = source;
		this.type = type;
		this.channel = channel;
		this.start = start;
		this.stop = stop;
		// line and column are taken from the TokenSource, when there is one
		if (source.a != null) {
			this.line = source.a.getLine();
			this.charPositionInLine = source.a.getCharPositionInLine();
		}
	}
	// Sets type and text; channel is DEFAULT_CHANNEL and source is EMPTY_SOURCE
	public CommonToken(int type, String text) {
		this.type = type;
		this.channel = DEFAULT_CHANNEL;
		this.text = text;
		this.source = EMPTY_SOURCE;
	}
	// Copies everything from another Token
	public CommonToken(Token oldToken) {
		type = oldToken.getType();
		line = oldToken.getLine();
		index = oldToken.getTokenIndex();
		charPositionInLine = oldToken.getCharPositionInLine();
		channel = oldToken.getChannel();
		start = oldToken.getStartIndex();
		stop = oldToken.getStopIndex();

		if (oldToken instanceof CommonToken) {
			// copy the raw fields (text may be null) and share the same source pair
			text = ((CommonToken)oldToken).text;
			source = ((CommonToken)oldToken).source;
		}
		else {
			// other Token implementations: go through the interface and build a new pair
			text = oldToken.getText();
			source = new Pair<TokenSource, CharStream>(oldToken.getTokenSource(), oldToken.getInputStream());
		}
	}

	// Returns the explicit text if set; otherwise the input-stream content from start to stop.
	// Returns null when there is no input stream, and "<EOF>" when start or stop
	// is not below the stream size.
	public String getText() {
		if ( text!=null ) {
			return text;
		}

		CharStream input = getInputStream();
		if ( input==null ) return null;
		int n = input.size();
		if ( start<n && stop<n) {
			return input.getText(Interval.of(start,stop));
		}
		else {
			return "<EOF>";
		}
	}
	
	...一些set和get方法
}

Pair
CommonToken代码中有Pair的概念,比如:protected Pair<TokenSource, CharStream> source
首先传入泛型A、B

/**
 * Holds two values, a and b, of generic types A and B.
 * Both fields are final and are assigned once in the constructor.
 */
public class Pair<A,B> implements Serializable {
	public final A a;
	public final B b;
	// The constructor stores the given a and b
	public Pair(A a, B b) {
		this.a = a;
		this.b = b;
	}
	equals/hashCode/toString
}

代码中可以看出,Pair的作用就是存储a和b两个值;

IntStream

IntStream
上面的CharStream继承自IntStream,这里的IntStream是Antlr定义的
一个简单的符号流,输入的符号都是整数

/**
 * A simple stream of symbols in which every symbol is an integer.
 */
public interface IntStream {
	/** Marks the end of the stream. */
	int EOF = -1;

	/** Name returned by getSourceName() when the source name is unknown. */
	String UNKNOWN_SOURCE_NAME = "<unknown>";

	/** Consumes the current input symbol. */
	void consume();

	/** Returns the symbol at the given offset. */
	int LA(int i);

	int mark();

	void release(int marker);

	int index();

	/** Moves the stream pointer to the given position. */
	void seek(int index);

	/** Returns the number of symbols in the stream, including EOF. */
	int size();

	String getSourceName();
}

CharStream
CharStream接口如下

/**
 * An IntStream whose symbols can also be read back as text.
 */
public interface CharStream extends IntStream {
	/** Returns the characters covered by the given interval of symbols. */
	String getText(Interval interval);
}

ANTLRInputStream
属性

/**
 * CharStream backed by a char array (field excerpt only).
 */
public class ANTLRInputStream implements CharStream {
	public static final int READ_BUFFER_SIZE = 1024;
	public static final int INITIAL_BUFFER_SIZE = 1024;

	/** Where the data is stored. */
	protected char[] data;

	/** How many characters are in the buffer. */
	protected int n;

	/** 0..n-1 index into string of next char; starts at 0. */
	protected int p;

	/** Name of this CharStream. */
	public String name;
}

构造方法很多,目的是初始化data和n

    /** Creates a stream without assigning data or n. */
    public ANTLRInputStream() { }

	/** Copy data in string to a local char array */
	public ANTLRInputStream(String input) {
		this.data = input.toCharArray();
		this.n = input.length();
	}

	/** This is the preferred constructor for strings as no data is copied */
	public ANTLRInputStream(char[] data, int numberOfActualCharsInArray) {
		this.data = data;
		this.n = numberOfActualCharsInArray;
	}

    /** Reads from r using INITIAL_BUFFER_SIZE and READ_BUFFER_SIZE. */
    public ANTLRInputStream(Reader r) throws IOException {
        this(r, INITIAL_BUFFER_SIZE, READ_BUFFER_SIZE);
    }

    /** Reads from r using the given initial size and READ_BUFFER_SIZE. */
    public ANTLRInputStream(Reader r, int initialSize) throws IOException {
        this(r, initialSize, READ_BUFFER_SIZE);
    }

    /** Hands r and both sizes to load(), which is not shown in this excerpt. */
    public ANTLRInputStream(Reader r, int initialSize, int readChunkSize) throws IOException {
        load(r, initialSize, readChunkSize);
    }

	/**
	 * Wraps input in an InputStreamReader and uses INITIAL_BUFFER_SIZE.
	 * No charset is passed, so the reader uses the default charset.
	 */
	public ANTLRInputStream(InputStream input) throws IOException {
		this(new InputStreamReader(input), INITIAL_BUFFER_SIZE);
	}

	/** Wraps input in an InputStreamReader (default charset) with the given initial size. */
	public ANTLRInputStream(InputStream input, int initialSize) throws IOException {
		this(new InputStreamReader(input), initialSize);
	}

	/** Wraps input in an InputStreamReader (default charset) with the given sizes. */
	public ANTLRInputStream(InputStream input, int initialSize, int readChunkSize) throws IOException {
		this(new InputStreamReader(input), initialSize, readChunkSize);
	}

其他方法

reset	//复位流
/**
 * Advances the read pointer p by one symbol.
 *
 * @throws IllegalStateException if the pointer is already at the end (p >= n)
 */
public void consume() {
	boolean atEnd = p >= n;
	if (atEnd) {
		assert LA(1) == IntStream.EOF;
		throw new IllegalStateException("cannot consume EOF");
	}
	// Not at the end here, so p < n holds and the pointer can step forward.
	p++;
}

/**
 * Looks at the symbol at offset i relative to the read pointer p without
 * consuming anything.
 *
 * @param i offset; 1 is the symbol at p, -1 the symbol just before p
 * @return 0 when i is 0 (undefined), IntStream.EOF when the offset falls
 *         before the first char or at/after n, otherwise the char at that offset
 */
public int LA(int i) {
	if (i == 0) {
		return 0; // undefined
	}
	// Negative offsets are shifted by one so that LA(-1) reads data[p-1].
	int offset = (i < 0) ? i + 1 : i;
	int target = p + offset - 1;
	if (i < 0 && target < 0) {
		return IntStream.EOF; // invalid; no char before first char
	}
	if (target >= n) {
		return IntStream.EOF;
	}
	return data[target];
}

index		//p
size		//n
mark		//-1
release		//空

/**
 * Moves the read pointer to index. Seeking backwards (or to the current
 * position) just assigns p; seeking forwards calls consume() repeatedly until
 * p reaches index or n, whichever comes first.
 *
 * @param index the target position
 */
public void seek(int index) {
	if (index > p) {
		// seek forward, consume until p hits the target or n (whichever comes first)
		int target = Math.min(index, n);
		while (p < target) {
			consume();
		}
		return;
	}
	// just jump; don't update stream state (line, ...)
	p = index;
}

/**
 * Returns the characters in the range described by interval.
 * The upper bound is clamped to n-1, and an interval that starts at or
 * beyond n yields the empty string.
 *
 * @param interval range a..b of character indexes
 * @return the text in that range
 */
public String getText(Interval interval) {
	final int first = interval.a;
	final int last = Math.min(interval.b, n - 1);
	if (first >= n) {
		return "";
	}
	return new String(data, first, last - first + 1);
}

/**
 * Returns the name of this stream, or UNKNOWN_SOURCE_NAME when the name is
 * null or empty.
 */
public String getSourceName() {
	boolean unnamed = (name == null) || name.isEmpty();
	return unnamed ? UNKNOWN_SOURCE_NAME : name;
}

/**
 * Returns the stream content as a string.
 * Only the first n chars of data are valid: the backing array can be larger
 * than n (for example when built from a char[] plus an explicit count), so
 * the string is limited to that range.
 */
public String toString() { return new String(data, 0, n); }

又遇到疑问,这Interval是什么东西?
LT和LA方法的作用是什么?
LA根据当前的位置返回IntStream中的内容,如果i=0没有定义,如果-1返回前一个,1返回下一个

Interval
Interval是间距的意思,代表a..b的内容

/**
 * An interval a..b (both ends are stored as plain ints).
 * This is an excerpt; the placeholder lines at the end stand for members that
 * are not reproduced here.
 */
public class Interval {
	// Largest value of a for which Interval.of caches the a..a interval
	public static final int INTERVAL_POOL_MAX_VALUE = 1000;

	public static final Interval INVALID = new Interval(-1,-2);
	// cache is an array with INTERVAL_POOL_MAX_VALUE+1 slots, indexed by a
	static Interval[] cache = new Interval[INTERVAL_POOL_MAX_VALUE+1];

	public int a;
	public int b;

	public static int creates = 0;
	public static int misses = 0;
	public static int hits = 0;
	public static int outOfRange = 0;

	// The constructor stores a and b
	public Interval(int a, int b) { this.a=a; this.b=b; }

	// Factory method; only intervals of the form a..a with 0 <= a <= INTERVAL_POOL_MAX_VALUE
	// are cached, everything else gets a new object
	public static Interval of(int a, int b) {
		// cache just a..a
		if ( a!=b || a<0 || a>INTERVAL_POOL_MAX_VALUE ) {
			return new Interval(a,b);
		}
		// fill the cache slot on first use
		if ( cache[a]==null ) {
			cache[a] = new Interval(a,a);
		}
		return cache[a];
	}
	// Number of elements in a..b; 0 when b is less than a
	public int length() {
		if ( b<a ) return 0;
		return b-a+1;
	}
	equals	// true when both a and b are the same

	...两个Interval之间的处理操做
}
posted @ 2016-12-14 09:52  zhangshihai1232  阅读(750)  评论(0)    收藏  举报