Beispiel #1
0
            /// <summary>
            /// Measures how much of a document's text has not been seen before and
            /// stores the result in the next slot of <c>percentUniqueForLastDocs</c>.
            /// </summary>
            /// <remarks>
            /// A text string counts as new when its hash code is not yet present in
            /// <c>stringHashes</c>; the hash code is then recorded there. Only hash codes
            /// are compared, so two different strings that happen to share a hash code
            /// are treated as the same string.
            /// </remarks>
            /// <param name="document">Document whose <c>TextStrings</c> are examined.</param>
            /// <returns>
            /// Number of characters in new strings divided by the total number of characters
            /// (a fraction, not a value out of 100); 1 when the document has no characters.
            /// </returns>
            internal float SetPercentUniqueForLastDoc(NLPTextDocument document)
            {
                lock (stringHashes)
                {
                    int totalChars = 0;
                    int newChars   = 0;
                    foreach (var text in document.TextStrings)
                    {
                        totalChars += text.Length;

                        var hash = text.GetHashCode();
                        if (stringHashes.Contains(hash))
                        {
                            // Hash already recorded: this string adds nothing new
                            continue;
                        }
                        stringHashes.Add(hash);
                        newChars += text.Length;
                    }

                    // A document without any character is reported as fully unique
                    float ratio = 1;
                    if (totalChars > 0)
                    {
                        ratio = newChars / (float)totalChars;
                    }

                    // Move to the next slot, wrapping back to the first one at the end of the array
                    if (++lastDocIndex >= percentUniqueForLastDocs.Length)
                    {
                        lastDocIndex = 0;
                    }
                    percentUniqueForLastDocs[lastDocIndex] = ratio;

                    return ratio;
                }
            }
Beispiel #2
0
 /// <summary>
 /// Entry point of the tree traversal, at the document level.
 /// When <c>textDocument</c> is already set it is returned directly; otherwise the
 /// Html document is traversed to build it, and the result is kept in
 /// <c>textDocument</c> for later calls.
 /// </summary>
 /// <returns>The value of <c>textDocument</c> after the conversion.</returns>
 public NLPTextDocument ConvertToNLPTextDocument()
 {
     // Conversion already done: hand back the stored result
     if (textDocument != null)
     {
         return textDocument;
     }

     // Working state used only while the traversal is running
     docBuilder       = new NLPTextDocumentBuilder(absoluteUri);
     textBuilderStack = new Stack<StringBuilder>();
     tableCoordsStack = new Stack<TableCoords>();

     if (htmlDocument.HasChildNodes)
     {
         // Analyse the document structure first, to find where to attach the section headers
         AnalyseDocumentStructureToDelimitSections();
         // Then traverse the tree of the Html document
         VisitChildNodes(htmlDocument);
     }

     textDocument = docBuilder.TextDocument;

     // Drop the references to the working state now that the result is stored
     tableCoordsStack = null;
     textBuilderStack = null;
     docBuilder       = null;

     return textDocument;
 }