// Estimates how close the instances of two entries of the same document are.
// Both instance lists are walked in parallel, always advancing the side whose
// current instance has the smaller OffsetNormal (this walk relies on the
// instances being ordered by offset - NOTE(review): ordering is assumed, it is
// not checked here).
//   - EntryProximity.Phrase   : some visited pair satisfies isPhraseProximity;
//   - EntryProximity.Sentence : some visited pair shares a sentence, no phrase pair was met;
//   - EntryProximity.Document : neither of the above.
internal static EntryProximity EstimateProximity(Entry Left, Entry Right)
{
    Debug.Assert(Left.DocIndex == Right.DocIndex, "Illegal precondition for calling Estimator - doc IDs are different");

    EntryProximity closest = EntryProximity.Document;
    int leftPos = 0;
    int rightPos = 0;

    while (leftPos < Left.Count && rightPos < Right.Count)
    {
        InstanceOffset leftInstance = Left.Instance(leftPos);
        InstanceOffset rightInstance = Right.Instance(rightPos);

        if (leftInstance.Sentence == rightInstance.Sentence)
        {
            // A phrase match is the tightest possible answer - no need to look further.
            if (isPhraseProximity(leftInstance, rightInstance))
            {
                return EntryProximity.Phrase;
            }
            closest = EntryProximity.Sentence;
        }

        // Move forward the side that currently lags behind.
        if (leftInstance.OffsetNormal < rightInstance.OffsetNormal)
        {
            leftPos++;
        }
        else
        {
            rightPos++;
        }
    }

    return closest;
}
// Appends to tempOffsets every instance reachable from 'inst' through the
// MappedInstances table: the entry stored under HC(docIndex, inst.OffsetNormal)
// is appended, then the entry stored under the key of that instance, and so on
// until a lookup yields null. 'inst' itself is not appended.
// NOTE(review): a cyclic chain in MappedInstances would never terminate -
// confirm that the callers cannot produce one.
private static void AddMappedInstances(IList tempOffsets, int docIndex, InstanceOffset inst)
{
    InstanceOffset current = inst;
    for (;;)
    {
        // The key is kept in a 'long' local so the lookup uses the same key type as before.
        long key = HC(docIndex, current.OffsetNormal);
        object next = MappedInstances[key];
        if (next == null)
        {
            return;
        }

        current = (InstanceOffset)next;
        tempOffsets.Add(current);
    }
}
//-------------------------------------------------------------------------
// Builds a new Entry that combines two entries of one document.
// The result takes DocIndex and Proximity from 'left' and the sum of both
// TfIdf values.
//
// When the required proximity is Phrase, only the instances that take part
// in the phrases found are kept, so that highlighting and contexts refer
// EXACTLY to the phrase occurrences and not to other occurrences of the
// search terms elsewhere in the document. For any other proximity the
// offsets of 'left' are simply followed by the offsets of 'right'.
private static Entry JoinInstancesOfEntries(Entry left, Entry right, EntryProximity requiredProximity)
{
    Entry merged = new Entry();
    merged.DocIndex = left.DocIndex;
    merged.TfIdf = left.TfIdf + right.TfIdf;
    merged.Proximity = left.Proximity;

    if (requiredProximity != EntryProximity.Phrase)
    {
        // Plain concatenation: all left instances, then all right instances.
        InstanceOffset[] allOffsets = new InstanceOffset[left.Count + right.Count];
        left.Offsets.CopyTo(allOffsets, 0);
        right.Offsets.CopyTo(allOffsets, left.Count);
        merged.Offsets = allOffsets;
        return merged;
    }

    // Phrase case. The walk below assumes that the offsets of both entries
    // are sorted in ascending order.
    ArrayList phraseOffsets = new ArrayList();
    for (int l = 0, r = 0; l < left.Count && r < right.Count; )
    {
        InstanceOffset leftInstance = left.Offsets[l];
        InstanceOffset rightInstance = right.Offsets[r];

        if (ProximityEstimator.isPhraseProximity(leftInstance, rightInstance))
        {
            phraseOffsets.Add(leftInstance);
            phraseOffsets.Add(rightInstance);

            // Pull in whatever is already chained after the right instance,
            // and only then record the new left -> right link.
            AddMappedInstances(phraseOffsets, left.DocIndex, rightInstance);
            MappedInstances[HC(left.DocIndex, leftInstance.OffsetNormal)] = rightInstance;
        }

        if (leftInstance.OffsetNormal < rightInstance.OffsetNormal)
        {
            l++;
        }
        else
        {
            r++;
        }
    }

    merged.Offsets = (InstanceOffset[])phraseOffsets.ToArray(typeof(InstanceOffset));
    return merged;
}
//-------------------------------------------------------------------------
//  Parses a plain sequence of bytes into an entry and its instances.
//  Comment: some entries may be marked as "removed", which means that the
//           corresponding documents no longer exist. The field
//           "DocsNumber" therefore counts *ALL* entries - valid and
//           removed - since the byte sequence cannot be physically
//           stripped. Non-existing documents are marked with "-1" as DocID,
//           so the actual space can be allocated only AFTER the number of
//           entries is known.
//
//  Reads, in this order: the document index, the TfIdf value and the
//  instance count (stored value + 1), followed by an (Offset, CompoundInfo)
//  pair of UInt32 values per instance. A valid entry is appended to
//  listTemporaryStorage; a removed one (DocIndex == -1) is dropped.
//  Throws FormatException when the instance count is negative or when the
//  instance array cannot be allocated.
//-------------------------------------------------------------------------
protected static void ParseEntry(BinaryReader reader)
{
    Entry entry = new Entry();
    entry.DocIndex = IndexConstructor.ReadCount(reader);
    entry.TfIdf = reader.ReadSingle();

    int instanceCount = IndexConstructor.ReadCount(reader) + 1;
    if (instanceCount < 0)
    {
        throw new FormatException("TermIndexRecord -- Illegal number of instances for a TermIndex record (" + instanceCount + ") - possible index corruption");
    }

    //  NB: an upper-bound sanity check on DocIndex (e.g. rejecting values of
    //      10000000 and above as index corruption) was considered; it needs an
    //      OpenAPI way to get the current maximal document Id from the
    //      ResourceStore, so it is not performed.

    if (entry.DocIndex == -1)
    {
        //  This entry has been "removed" - it is not used in subsequent processing.
        //  NOTE(review): no instance data is consumed from the reader for a
        //  removed entry - confirm this matches the stream layout.
        return;
    }

    try
    {
        InstanceOffset[] instances = new InstanceOffset[instanceCount];
        int slot = 0;
        while (slot < instanceCount)
        {
            instances[slot].Offset = reader.ReadUInt32();
            instances[slot].CompoundInfo = reader.ReadUInt32();
            slot++;
        }

        entry.Offsets = instances;
        listTemporaryStorage.Add(entry);
    }
    catch (OutOfMemoryException)
    {
        //  An absurdly large count is reported as a format problem.
        throw new FormatException("TermIndexRecord - illegal number of term instances: [" + instanceCount + "]");
    }
}
// Produces one highlight anchor per instance of 'entry'.
//   entry   - entry whose instances are to be highlighted;
//   lexemes - lexeme table indexed by the BaseID of an instance;
//   anchors - receives entry.Count anchors, sorted with AnchorComparer.
// For each instance the anchor keeps the lexeme (Original), the wordform
// rebuilt by ReconstructWordform (Text), the normalized offset and the
// section id and name. Every processed term is also written to the trace.
public static void GetHighlightedTerms(Entry entry, string[] lexemes, out WordPtr[] anchors)
{
    anchors = new WordPtr[entry.Count];
    Trace.WriteLine("HighlightTerms -- the following terms were processed for highlighting: ");

    int slot = 0;
    while (slot < entry.Count)
    {
        InstanceOffset occurrence = entry.Instance(slot);
        string baseForm = lexemes[occurrence.BaseID];

        anchors[slot].Original = baseForm;
        anchors[slot].Text = ReconstructWordform(occurrence.Offset, baseForm, OMEnv.DictionaryServer);
        anchors[slot].StartOffset = occurrence.OffsetNormal;
        anchors[slot].SectionId = (int)occurrence.SectionId;
        anchors[slot].Section = DocSectionHelper.FullNameByOrder(occurrence.SectionId);

        // Diagnostic output only.
        Trace.WriteLine(" [" + anchors[slot].Text + "] at " + occurrence.OffsetNormal + ", section " + anchors[slot].Section + ", sentence " + occurrence.Sentence);

        slot++;
    }

    Array.Sort(anchors, new AnchorComparer());
}
// Tells whether two instances form a phrase: true when the difference
// (left.TokenOrder - right.TokenOrder) equals -1, i.e. the token order of
// 'left' is exactly one less than that of 'right'. The check is directional:
// swapping the arguments of a matching pair yields false.
internal static bool isPhraseProximity(InstanceOffset left, InstanceOffset right)
{
    // Difference expected when 'left' sits directly in front of 'right'.
    const int LeftDirectlyBeforeRight = -1;

    return (left.TokenOrder - right.TokenOrder) == LeftDirectlyBeforeRight;
}
// Builds the textual context (a sequence of text fragments separated by
// cFragmentsDelimiter) around the instances of a term entry, together with
// the positions to highlight inside that context.
//   termEntry - entry whose document text and instances are used;
//   lexemes   - lexeme table indexed by the BaseID of an instance;
//   hgltPairs - receives OffsetData items (start offset within the returned
//               context, length of the reconstructed wordform).
// Returns cNoContextSign when the resource cannot be loaded or its text body
// is empty. The method is best-effort: any exception raised during extraction
// is traced and whatever has been built so far is returned.
public static string GetContext(Entry termEntry, string[] lexemes, out ArrayList hgltPairs)
{
    string context = cNoContextSign;
    int contextsNumber = Math.Min(MinimalNumberOfContexts, termEntry.Count);
    int[] shifts = new int[termEntry.Count];
    hgltPairs = new ArrayList();
    Collector.Init(termEntry.Offsets, shifts);

    try
    {
        // The temporary file may be removed while this processing is under
        // way, hence TryLoadResource and the null check.
        IResource res = Core.ResourceStore.TryLoadResource(termEntry.DocIndex);
        if (res != null)
        {
            Core.PluginLoader.InvokeResourceTextProviders(res, Collector);
            if (Collector.Body.Length > 0)
            {
                context = cFragmentsDelimiter;

                // Borders of the most recently extracted fragment; the initial
                // values guarantee that the first instance opens a new fragment.
                int leftBorder = Int32.MaxValue, rightBorder = Int32.MinValue;
                // Length of the context just before the current fragment was appended.
                int prevContextLength = 0;

                for (int i = 0; i < contextsNumber; i++)
                {
                    InstanceOffset instance = termEntry.Instance(i);
                    int origOffset = instance.OffsetNormal;
                    int offset = Collector.ConvertOffset(origOffset, instance.SectionId);
                    ArrayList delimiterOffsets = new ArrayList();

                    // Workaround for a possibly invalid text body reconstruction
                    // by a plugin, when search terms appear outside the text margins.
                    if (offset < Collector.Body.Length)
                    {
                        if (offset < leftBorder || offset > rightBorder)
                        {
                            // The instance lies outside the current fragment - cut a new one.
                            leftBorder = Math.Max(0, offset - cContextSideLength);
                            rightBorder = Math.Min(Collector.Body.Length - 1, offset + cContextSideLength);
                            TuneBorders(offset, Collector.Body, ref leftBorder, ref rightBorder);

                            string fragment = Collector.Body.Substring(leftBorder, rightBorder - leftBorder + 1);
                            InsertSectionDelimiters(ref fragment, leftBorder, rightBorder, context.Length, delimiterOffsets);
                            prevContextLength = context.Length;
                            context += fragment + cFragmentsDelimiter;
                        }
                        else if (contextsNumber < termEntry.Count)
                        {
                            // The instance falls into the fragment already extracted,
                            // so one more instance may be examined.
                            contextsNumber++;
                        }

                        int startOffset = offset - leftBorder + prevContextLength;
                        string lexeme = lexemes[instance.BaseID];
                        lexeme = ReconstructWordform(instance.Offset, lexeme, OMEnv.DictionaryServer);
                        TuneOffsetByBorders(ref startOffset, delimiterOffsets);
                        hgltPairs.Add(new OffsetData(startOffset, lexeme.Length));
                    }
                }

                context = context.Replace("\r\n", " ");
                context = context.Replace("\n", " ");
                context = context.Replace("\r", " ");
                context = context.Replace("\t", " ");

                // Diagnostic output only.
                Trace.WriteLine("ContextExtractor -- context for [" + termEntry.DocIndex + "/" + res.Type + "] is [" + context + "]");
                foreach (OffsetData pair in hgltPairs)
                {
                    if (pair.Start + pair.Length >= context.Length)
                    {
                        Trace.WriteLine(" highlight prefix of token [" + context.Substring(pair.Start) + "]");
                    }
                    else
                    {
                        Trace.WriteLine(" highlight token [" + context.Substring(pair.Start, pair.Length) + "]");
                    }
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Exceptions described in OM-10659 are caught here; their cause has
        // not been found yet. The failure is deliberately not propagated, but
        // it is traced so that it no longer disappears without any evidence.
        Trace.WriteLine("ContextExtractor -- context extraction failed for document [" + termEntry.DocIndex + "]: " + ex);
    }

    return context;
}