public override sealed bool IncrementToken()
{
    if (m_tokens.Count > 0)
    {
        Debug.Assert(current != null);
        CompoundToken token = m_tokens.First.Value;
        m_tokens.Remove(token);
        RestoreState(current); // keep all other attributes untouched
        m_termAtt.SetEmpty().Append(token.Text);
        m_offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
        posIncAtt.PositionIncrement = 0;
        return true;
    }

    current = null; // not really needed, but for safety
    if (m_input.IncrementToken())
    {
        // Only words longer than minWordSize get processed
        if (m_termAtt.Length >= this.m_minWordSize)
        {
            Decompose();
            // only capture the state if we really need it for producing new tokens
            if (m_tokens.Count > 0)
            {
                current = CaptureState();
            }
        }
        // return original token:
        return true;
    }
    else
    {
        return false;
    }
}
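For context, callers never invoke IncrementToken directly on the filter; they drive the whole TokenStream through the standard attribute loop. The sketch below is a minimal consumer, assuming the Lucene.NET 4.8 attribute interfaces (ICharTermAttribute, IOffsetAttribute, IPositionIncrementAttribute); TokenStreamDemo and PrintTokens are hypothetical names, not part of the filter's own code.

// Minimal consumption sketch (Lucene.NET 4.8-style APIs assumed).
using System;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;

static class TokenStreamDemo
{
    public static void PrintTokens(TokenStream stream)
    {
        // Attribute instances are shared; IncrementToken() overwrites them in place.
        var termAtt = stream.AddAttribute<ICharTermAttribute>();
        var offsetAtt = stream.AddAttribute<IOffsetAttribute>();
        var posIncAtt = stream.AddAttribute<IPositionIncrementAttribute>();

        stream.Reset();
        while (stream.IncrementToken())
        {
            // Decompounded subwords come back with PositionIncrement == 0,
            // stacked on top of the original token that produced them.
            Console.WriteLine($"{termAtt.ToString()} [{offsetAtt.StartOffset}-{offsetAtt.EndOffset}] +{posIncAtt.PositionIncrement}");
        }
        stream.End();
        stream.Dispose();
    }
}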
protected override void Decompose()
{
    // get the hyphenation points
    Hyphenation.Hyphenation hyphens = hyphenator.Hyphenate(m_termAtt.Buffer, 0, m_termAtt.Length, 1, 1);
    // No hyphen points found -> exit
    if (hyphens is null)
    {
        return;
    }

    int[] hyp = hyphens.HyphenationPoints;

    for (int i = 0; i < hyp.Length; ++i)
    {
        int remaining = hyp.Length - i;
        int start = hyp[i];
        CompoundToken longestMatchToken = null;
        for (int j = 1; j < remaining; j++)
        {
            int partLength = hyp[i + j] - start;

            // if the part is longer than maxSubwordSize we
            // are done with this round
            if (partLength > this.m_maxSubwordSize)
            {
                break;
            }

            // we only put subwords to the token stream
            // that are longer than minPartSize
            if (partLength < this.m_minSubwordSize)
            {
                // BOGUS/BROKEN/FUNKY/WACKO: somehow we have negative 'parts' according to the
                // calculation above, and we rely upon minSubwordSize being >=0 to filter them out...
                continue;
            }

            // check the dictionary
            if (m_dictionary is null || m_dictionary.Contains(m_termAtt.Buffer, start, partLength))
            {
                if (this.m_onlyLongestMatch)
                {
                    if (longestMatchToken != null)
                    {
                        if (longestMatchToken.Text.Length < partLength)
                        {
                            longestMatchToken = new CompoundToken(this, start, partLength);
                        }
                    }
                    else
                    {
                        longestMatchToken = new CompoundToken(this, start, partLength);
                    }
                }
                else
                {
                    m_tokens.Enqueue(new CompoundToken(this, start, partLength));
                }
            }
        }

        // flush the longest match (if any) once all candidate parts for this
        // start position have been examined
        if (this.m_onlyLongestMatch && longestMatchToken != null)
        {
            m_tokens.Enqueue(longestMatchToken);
        }
    }
}
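To make the hyphenation arithmetic concrete: each candidate subword runs from one hyphenation point to a later one, and its length is the difference between the two positions. The standalone sketch below replays that span calculation with an invented word and invented break positions; no real hyphenator or dictionary is involved.

// Illustrative only: the sample word and point positions are made up for demonstration.
using System;

static class HyphenationSpanDemo
{
    public static void Main()
    {
        string term = "Rindfleisch";
        int[] hyp = { 0, 4, 11 };            // assumed break positions: "Rind|fleisch"
        const int minSubwordSize = 2, maxSubwordSize = 15;

        for (int i = 0; i < hyp.Length; ++i)
        {
            int start = hyp[i];
            for (int j = 1; j < hyp.Length - i; j++)
            {
                int partLength = hyp[i + j] - start;
                if (partLength > maxSubwordSize) break;
                if (partLength < minSubwordSize) continue;
                // These are the spans the filter would test against the dictionary.
                Console.WriteLine(term.Substring(start, partLength));
            }
        }
    }
}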
public string RenderToHtml(string markdown)
{
    if (markdown == null)
    {
        throw new ArgumentNullException(nameof(markdown));
    }

    var reader = new TokenReader(new MdLanguage(), markdown);
    var token = new CompoundToken(reader.ReadTokens());
    var renderer = new HtmlTokenRenderer();
    return renderer.Render(token);
}
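A hypothetical call site for this method might look like the following; the class name MarkdownRenderer is assumed here, since the snippet does not show the type that hosts RenderToHtml.

var renderer = new MarkdownRenderer();   // hypothetical name of the host class
string html = renderer.RenderToHtml("# Title\nSome *emphasised* text.");
Console.WriteLine(html);                 // prints whatever HtmlTokenRenderer produces for the token tree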
protected override void Decompose()
{
    int len = m_termAtt.Length;
    for (int i = 0; i <= len - this.m_minSubwordSize; ++i)
    {
        CompoundToken longestMatchToken = null;
        for (int j = this.m_minSubwordSize; j <= this.m_maxSubwordSize; ++j)
        {
            if (i + j > len)
            {
                break;
            }
            if (m_dictionary.Contains(m_termAtt.Buffer, i, j))
            {
                if (this.m_onlyLongestMatch)
                {
                    if (longestMatchToken != null)
                    {
                        if (longestMatchToken.Text.Length < j)
                        {
                            longestMatchToken = new CompoundToken(this, i, j);
                        }
                    }
                    else
                    {
                        longestMatchToken = new CompoundToken(this, i, j);
                    }
                }
                else
                {
                    m_tokens.Enqueue(new CompoundToken(this, i, j));
                }
            }
        }
        if (this.m_onlyLongestMatch && longestMatchToken != null)
        {
            m_tokens.Enqueue(longestMatchToken);
        }
    }
}
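A hedged end-to-end sketch of how this dictionary-based Decompose is typically exercised, assuming Lucene.NET 4.8 type and constructor names (WhitespaceTokenizer, CharArraySet, DictionaryCompoundWordTokenFilter, LuceneVersion.LUCENE_48); it reuses the hypothetical TokenStreamDemo.PrintTokens helper from the earlier sketch.

// Wrap a whitespace-tokenized stream in the decompounding filter (API names assumed).
using System.IO;
using Lucene.Net.Analysis.Compound;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;

var dict = new CharArraySet(LuceneVersion.LUCENE_48,
    new[] { "rind", "fleisch", "wurst" }, true);

using var reader = new StringReader("Rindfleisch");
var tokenizer = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, reader);
var filter = new DictionaryCompoundWordTokenFilter(LuceneVersion.LUCENE_48, tokenizer, dict);

// Emits the original token plus any dictionary subwords found by Decompose().
TokenStreamDemo.PrintTokens(filter);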
protected internal override void Decompose()
{
    int len = termAtt.Length;
    for (int i = 0; i <= len - this.minSubwordSize; ++i)
    {
        CompoundToken longestMatchToken = null;
        for (int j = this.minSubwordSize; j <= this.maxSubwordSize; ++j)
        {
            if (i + j > len)
            {
                break;
            }
            if (dictionary.Contains(termAtt.Buffer(), i, j))
            {
                if (this.onlyLongestMatch)
                {
                    if (longestMatchToken != null)
                    {
                        if (longestMatchToken.txt.Length < j)
                        {
                            longestMatchToken = new CompoundToken(this, i, j);
                        }
                    }
                    else
                    {
                        longestMatchToken = new CompoundToken(this, i, j);
                    }
                }
                else
                {
                    tokens.AddLast(new CompoundToken(this, i, j));
                }
            }
        }
        if (this.onlyLongestMatch && longestMatchToken != null)
        {
            tokens.AddLast(longestMatchToken);
        }
    }
}
protected internal override void decompose()
{
    // get the hyphenation points
    Hyphenation hyphens = hyphenator.hyphenate(termAtt.Buffer(), 0, termAtt.Length(), 1, 1);
    // No hyphen points found -> exit
    if (hyphens == null)
    {
        return;
    }

    int[] hyp = hyphens.HyphenationPoints;

    for (int i = 0; i < hyp.Length; ++i)
    {
        int remaining = hyp.Length - i;
        int start = hyp[i];
        CompoundToken longestMatchToken = null;
        for (int j = 1; j < remaining; j++)
        {
            int partLength = hyp[i + j] - start;

            // if the part is longer than maxSubwordSize we
            // are done with this round
            if (partLength > this.maxSubwordSize)
            {
                break;
            }

            // we only put subwords to the token stream
            // that are longer than minPartSize
            if (partLength < this.minSubwordSize)
            {
                // BOGUS/BROKEN/FUNKY/WACKO: somehow we have negative 'parts' according to the
                // calculation above, and we rely upon minSubwordSize being >=0 to filter them out...
                continue;
            }

            // check the dictionary
            if (dictionary == null || dictionary.Contains(termAtt.Buffer(), start, partLength))
            {
                if (this.onlyLongestMatch)
                {
                    if (longestMatchToken != null)
                    {
                        if (longestMatchToken.txt.Length() < partLength)
                        {
                            longestMatchToken = new CompoundToken(this, start, partLength);
                        }
                    }
                    else
                    {
                        longestMatchToken = new CompoundToken(this, start, partLength);
                    }
                }
                else
                {
                    tokens.AddLast(new CompoundToken(this, start, partLength));
                }
            }
            else if (dictionary.Contains(termAtt.Buffer(), start, partLength - 1))
            {
                // check the dictionary again with a word that is one character shorter
                // to avoid problems with genitive 's characters and other binding characters
                if (this.onlyLongestMatch)
                {
                    if (longestMatchToken != null)
                    {
                        if (longestMatchToken.txt.Length() < partLength - 1)
                        {
                            longestMatchToken = new CompoundToken(this, start, partLength - 1);
                        }
                    }
                    else
                    {
                        longestMatchToken = new CompoundToken(this, start, partLength - 1);
                    }
                }
                else
                {
                    tokens.AddLast(new CompoundToken(this, start, partLength - 1));
                }
            }
        }

        if (this.onlyLongestMatch && longestMatchToken != null)
        {
            tokens.AddLast(longestMatchToken);
        }
    }
}
private void parseLexemes()
{
    var children = new List<Token>();
    lexInd = 0;

    /*
     * <module> ::= { <import> } /\n+/
     *              <export> [ <implement> ] /\n+/
     *              { <definition> /\n+/ }
     */

    // Get all the imports
    while (lexInd < lexemes.Length && (string)lexemes[lexInd].Source() == "imports")
    {
        var import = parseImport();
        if (lexInd >= lexemes.Length)
        {
            throw new UnexpectedEOFException(import.Line());
        }
        else if (lexemes[lexInd].Type() != TokenType.NewLine)
        {
            throw new UnexpectedTokenException(
                lexemes[lexInd].Type(),
                new TokenType[] { TokenType.NewLine },
                import.Line()
            );
        }
        var newLine = lexemes[lexInd++];
        eatNewLines();
        children.Add(import);
        children.Add(newLine);
    }

    // Get the exports and implements line
    var export = parseExport();
    children.Add(export);
    if (lexInd >= lexemes.Length)
    {
        throw new UnexpectedEOFException(export.Line());
    }
    else if ((string)lexemes[lexInd].Source() == "implements")
    {
        children.Add(parseImplement()); // Optional, hence if stmt
    }
    if (lexInd >= lexemes.Length)
    {
        throw new UnexpectedEOFException(export.Line());
    }
    else if (lexemes[lexInd].Type() != TokenType.NewLine)
    {
        throw new UnexpectedTokenException(
            lexemes[lexInd].Type(),
            new TokenType[] { TokenType.NewLine },
            export.Line()
        );
    }
    children.Add(lexemes[lexInd++]);
    eatNewLines();

    // Finally, parse definitions
    while (lexInd < lexemes.Length)
    {
        var definition = parseDefinition();
        if (lexInd >= lexemes.Length)
        {
            throw new UnexpectedEOFException(definition.Line());
        }
        else if (lexemes[lexInd].Type() != TokenType.NewLine)
        {
            throw new UnexpectedTokenException(
                lexemes[lexInd].Type(),
                new TokenType[] { TokenType.NewLine },
                definition.Line()
            );
        }
        var newLine3 = lexemes[lexInd++];
        eatNewLines();
        children.Add(definition);
        children.Add(newLine3);
    }

    ast = new CompoundToken(TokenType.Module, children, 1);
}
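The eatNewLines helper is referenced above but not shown; a plausible shape, assumed from its call sites, is simply to skip past a run of consecutive newline tokens so the /\n+/ separators in the grammar collapse to a single recorded newline.

// Assumed sketch of the helper; not shown in the original source.
private void eatNewLines()
{
    while (lexInd < lexemes.Length && lexemes[lexInd].Type() == TokenType.NewLine)
    {
        lexInd++;
    }
}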