/// <summary>
/// Measures how much of the text of <paramref name="document"/> has not been
/// encountered before, and records that measure for the most recent documents.
/// </summary>
/// <remarks>
/// A text string counts as "already seen" when its hash code is present in
/// <c>stringHashes</c>; only the hash code is compared, not the string itself.
/// Hash codes that were not present are added to <c>stringHashes</c> as a side effect.
/// The whole computation runs while holding the lock on <c>stringHashes</c>.
/// </remarks>
/// <param name="document">Document whose <c>TextStrings</c> are examined.</param>
/// <returns>
/// The ratio (unique characters / total characters) as a fraction, not scaled to 100;
/// 1 when the document contains no characters at all.
/// </returns>
internal float SetPercentUniqueForLastDoc(NLPTextDocument document)
{
    lock (stringHashes)
    {
        int totalChars = 0;
        int unseenChars = 0;

        foreach (var text in document.TextStrings)
        {
            var length = text.Length;
            totalChars += length;

            var hash = text.GetHashCode();
            if (stringHashes.Contains(hash))
            {
                // Hash already registered: these characters are not counted as unique.
                continue;
            }

            stringHashes.Add(hash);
            unseenChars += length;
        }

        // A document without any character is reported as fully unique.
        float ratio = 1;
        if (totalChars > 0)
        {
            ratio = unseenChars / (float)totalChars;
        }

        // Move to the next slot of the history array, wrapping back to slot 0
        // once the end of the array is reached.
        lastDocIndex++;
        if (percentUniqueForLastDocs.Length <= lastDocIndex)
        {
            lastDocIndex = 0;
        }
        percentUniqueForLastDocs[lastDocIndex] = ratio;

        return ratio;
    }
}
/// <summary>
/// Start of the tree traversal at the document level.
/// The conversion runs only while <c>textDocument</c> is still null; once a
/// result has been stored, that same instance is returned on later calls.
/// </summary>
/// <returns>The value of <c>textDocument</c> after the conversion attempt.</returns>
public NLPTextDocument ConvertToNLPTextDocument()
{
    // A previous call already produced the document: nothing left to do.
    if (textDocument != null)
    {
        return textDocument;
    }

    // Working state used only for the duration of the traversal
    docBuilder = new NLPTextDocumentBuilder(absoluteUri);
    textBuilderStack = new Stack<StringBuilder>();
    tableCoordsStack = new Stack<TableCoords>();

    if (htmlDocument.HasChildNodes)
    {
        // Analyse document structure to find where to attach the section headers
        AnalyseDocumentStructureToDelimitSections();

        // Traverse the tree of the Html document
        VisitChildNodes(htmlDocument);
    }

    textDocument = docBuilder.TextDocument;

    // Drop the references to the working state now that the result is stored
    docBuilder = null;
    textBuilderStack = null;
    tableCoordsStack = null;

    return textDocument;
}