/// <summary>
/// Reads the XML file at <paramref name="filename"/> and creates one
/// <see cref="TextFileElement"/> for every text node, and for every element
/// whose concatenated attribute values are not blank.
/// </summary>
/// <param name="filename">Path of the XML file to read.</param>
/// <returns>
/// The elements created, in the order the reader visits the nodes; an empty
/// list when no node yields non-whitespace text.
/// </returns>
/// <remarks>Exceptions raised while reading are not caught here.</remarks>
public List<ProgramElement> Parse(string filename)
{
    var programElements = new List<ProgramElement>();

    // Dispose the reader so the file is released even when reading throws.
    using (var reader = new XmlTextReader(filename))
    {
        while (reader.Read())
        {
            string text = String.Empty;
            if (reader.NodeType == XmlNodeType.Text)
            {
                text = reader.Value;
            }
            else if (reader.NodeType == XmlNodeType.Element)
            {
                // Join every attribute value of the element, separated by spaces.
                while (reader.MoveToNextAttribute())
                {
                    text += reader.Value + " ";
                }
            }

            if (String.IsNullOrWhiteSpace(text))
            {
                continue;
            }

            // Strip surrounding spaces, tabs and line breaks in a single pass.
            var cleanedText = text.Trim(' ', '\n', '\r', '\t');
            var snippet = SrcMLParsingUtils.RetrieveSource(cleanedText);
            programElements.Add(new TextFileElement(filename, snippet, cleanedText));
        }
    }

    return programElements;
}
/// <summary>
/// Reads the file line by line and groups consecutive lines into chunks,
/// creating one <see cref="TextFileElement"/> per chunk.
/// </summary>
/// <remarks>
/// A chunk is closed once the summed lengths of its lines (line terminators
/// not counted) reach <c>MaxLengthOfTermInLucene</c> and the chunk text is not
/// whitespace-only. Whatever remains after the last line is emitted as a final
/// chunk if it is not blank. Each line is stored followed by
/// <see cref="Environment.NewLine"/>.
/// </remarks>
/// <param name="filename">Path of the text file to read.</param>
/// <returns>
/// The chunks created, in file order. If an exception occurs it is reported
/// through <c>LogEvents.ParserGenericFileError</c> and the chunks collected
/// up to that point are returned.
/// </returns>
public List<ProgramElement> Parse(string filename)
{
    var list = new List<ProgramElement>();
    try
    {
        int charactersInCurrentChunk = 0;
        int currentLineNumber = 1;
        int startingLineNumber = 1;

        using (var sr = new StreamReader(filename))
        {
            var fileText = new StringBuilder();
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                fileText.Append(line).Append(Environment.NewLine);
                charactersInCurrentChunk += line.Length;
                currentLineNumber++;

                if (charactersInCurrentChunk < MaxLengthOfTermInLucene)
                {
                    continue;
                }

                // Materialise the chunk once and reuse it for the check and both arguments.
                var chunkText = fileText.ToString();
                if (String.IsNullOrWhiteSpace(chunkText))
                {
                    // A blank chunk is not emitted; keep accumulating into it.
                    continue;
                }

                list.Add(new TextFileElement(startingLineNumber, 0, filename, chunkText, chunkText));

                // The next chunk starts at the line following the one just consumed.
                startingLineNumber = currentLineNumber;
                charactersInCurrentChunk = 0;
                fileText = new StringBuilder();
            }

            // Flush whatever is left after the last line.
            var remainingText = fileText.ToString();
            if (!String.IsNullOrWhiteSpace(remainingText))
            {
                list.Add(new TextFileElement(startingLineNumber, 0, filename, remainingText, remainingText));
            }
        }
    }
    catch (Exception)
    {
        // Best effort: report the failure and return what was parsed so far.
        LogEvents.ParserGenericFileError(this, filename);
    }

    return list;
}
/// <summary>
/// Returns the words that <c>GetDefaultLetterWords</c> produces for the
/// <c>Body</c> of the given element.
/// </summary>
/// <param name="element">Element whose <c>Body</c> is handed to <c>GetDefaultLetterWords</c>.</param>
/// <returns>The sequence returned by <c>GetDefaultLetterWords</c>.</returns>
private static IEnumerable<string> ExtractTextLineElement(TextFileElement element)
{
    var body = element.Body;
    return GetDefaultLetterWords(body);
}