/// <summary>
/// Creates a scored document that pairs a table row with its score.
/// </summary>
/// <param name="tableRow">The row to wrap; must not be null.</param>
/// <param name="score">The score assigned to the row.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="tableRow"/> is null.
/// </exception>
public ScoredDocument(DocumentTableRow tableRow, double score)
{
    if (tableRow == null)
    {
        // Report the actual parameter name so the exception points at the right argument.
        throw new ArgumentNullException("tableRow");
    }

    TableRow = tableRow;
    Score = score;
}
/// <summary>
/// Builds a lookup from each field key of the row to the zero-based position
/// that key occupies in the row's key list.
/// </summary>
/// <param name="document">The row whose field keys are indexed.</param>
/// <returns>A dictionary mapping every field key to its position, cast to <c>short</c>.</returns>
public static IDictionary<string, short> ToKeyIndex(this DocumentTableRow document)
{
    // Snapshot the keys first so positions refer to one fixed ordering.
    var fieldNames = document.Fields.Keys.ToList();
    var positions = new Dictionary<string, short>();

    var position = 0;
    foreach (var fieldName in fieldNames)
    {
        positions.Add(fieldName, (short)position);
        position++;
    }

    return positions;
}
/// <summary>
/// Analyzes the document, registers every analyzed term with the tree builder,
/// writes the document to the session and logs the document's table ID.
/// </summary>
/// <param name="document">The row to analyze and write.</param>
/// <param name="session">The write session that receives the document.</param>
public void Write(DocumentTableRow document, IWriteSession session)
{
    // Each analyzed term is added under its own field and value.
    foreach (var analyzed in _analyzer.AnalyzeDocument(document))
    {
        _treeBuilder.Add(analyzed.Field, analyzed.Value, analyzed);
    }

    // The document itself is written only after all of its terms were added.
    session.Write(document);

    Log.DebugFormat("analyzed doc ID {0}", document.TableId);
}
/// <summary>
/// Lazily reads documents from the <c>*.zip</c> archives found under
/// <c>_directory</c> (searched recursively).
/// </summary>
/// <remarks>
/// For each archive the first entry whose name ends with <c>.txt</c> is read:
/// the first two lines form the title, the following lines up to the first line
/// containing <c>"*** "</c> form the head, and the rest of the file is the body.
/// An archive yields no document when it has no <c>.txt</c> entry, when no line
/// containing <c>"encoding: ASCII"</c> appears before the marker line, when the
/// text ends before a marker line is found, or when reading it throws (the
/// failure is logged and the archive is skipped).
/// The first <c>_skip</c> archives are skipped without being opened, and
/// enumeration stops once <c>_take</c> documents have been yielded.
/// </remarks>
/// <returns>One <see cref="DocumentTableRow"/> per successfully parsed archive.</returns>
private IEnumerable<DocumentTableRow> ReadInternal()
{
    var files = Directory.GetFiles(_directory, "*.zip", SearchOption.AllDirectories);
    var skipped = 0;
    var took = 0;

    foreach (var zipFileName in files)
    {
        // Skipping counts archives, not yielded documents: the check runs before a file is opened.
        if (_skip > 0 && skipped++ < _skip)
        {
            continue;
        }

        if (took == _take)
        {
            break;
        }

        DocumentTableRow document = null;

        try
        {
            // Open read-only: the (path, mode) FileStream constructor requests read/write
            // access, which makes read-only archives fail and get skipped as "unreadable".
            using (var fileStream = new FileStream(zipFileName, FileMode.Open, FileAccess.Read, FileShare.Read))
            using (var zip = new ZipArchive(fileStream, ZipArchiveMode.Read))
            {
                // Pick the first text entry; the extension check is ordinal so it does not
                // depend on the current culture.
                ZipArchiveEntry txtFile = null;
                foreach (var entry in zip.Entries)
                {
                    if (entry.Name.EndsWith(".txt", StringComparison.Ordinal))
                    {
                        txtFile = entry;
                        break;
                    }
                }

                if (txtFile != null)
                {
                    using (var txtStream = txtFile.Open())
                    using (var reader = new StreamReader(txtStream))
                    {
                        // The title is the first two lines joined by a single space.
                        var title = reader.ReadLine() + " " + reader.ReadLine();
                        var head = new StringBuilder();
                        var couldNotRead = false;
                        string encoding = null;

                        // Consume header lines until the "*** " marker line.
                        while (true)
                        {
                            var line = reader.ReadLine();
                            if (line == null)
                            {
                                // End of text reached without ever seeing the marker.
                                couldNotRead = true;
                                break;
                            }
                            else if (line.Contains("*** "))
                            {
                                break;
                            }

                            if (line.Contains("encoding: ASCII"))
                            {
                                // The encoding line is remembered but kept out of the head text.
                                encoding = line;
                            }
                            else
                            {
                                head.Append(" ");
                                head.Append(line);
                            }
                        }

                        // Only files that declare ASCII encoding and have a complete header are used.
                        if (encoding == null || couldNotRead)
                        {
                            continue;
                        }

                        var body = reader.ReadToEnd();

                        // NOTE(review): "head" is passed as the StringBuilder itself, exactly as before;
                        // confirm Field expects this rather than head.ToString().
                        document = new DocumentTableRow(
                            new List<Field>
                            {
                                new Field("title", title),
                                new Field("head", head),
                                new Field("body", body),
                                new Field("uri", zipFileName.Replace(_directory, ""))
                            });
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // Best effort: a broken archive is logged and skipped instead of aborting the whole read.
            Log.InfoFormat("unreadable file: {0} {1}", zipFileName, ex.Message);
            continue;
        }

        // The yield stays outside the try block because C# forbids yielding inside try/catch.
        if (document != null)
        {
            yield return document;
            took++;
        }
    }
}