Code example #1
File: QueryProcessor.cs  Project: mo5h/omeo
        internal static EntryProximity  EstimateProximity(Entry Left, Entry Right)
        {
            Debug.Assert(Left.DocIndex == Right.DocIndex, "Illegal precondition for calling Estimator - doc IDs are different");

            int            iLeft = 0, iRight = 0;
            EntryProximity Result = EntryProximity.Document;

            while ((iLeft < Left.Count) && (iRight < Right.Count))
            {
                InstanceOffset leftOff  = Left.Instance(iLeft);
                InstanceOffset rightOff = Right.Instance(iRight);

                if (leftOff.Sentence == rightOff.Sentence)
                {
                    Result = EntryProximity.Sentence;
                    if (isPhraseProximity(leftOff, rightOff))
                    {
                        Result = EntryProximity.Phrase;
                        break;
                    }
                }
                if (leftOff.OffsetNormal < rightOff.OffsetNormal)
                {
                    iLeft++;
                }
                else
                {
                    iRight++;
                }
            }

            return(Result);
        }
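
EstimateProximity above walks two offset lists that are sorted in ascending order with a two-pointer merge, always advancing the side that lags behind, and escalates the result from Document to Sentence to Phrase as closer matches are found. Below is a minimal standalone sketch of the same merge pattern, using plain integer offsets instead of the project's Entry/InstanceOffset types (all names in the sketch are illustrative, not part of omeo):

    using System;

    static class ProximitySketch
    {
        //  Returns the smallest absolute distance between any pair of offsets
        //  taken from two ascending-sorted arrays. Advancing only the pointer
        //  that lags behind keeps the scan at O(n + m) instead of O(n * m).
        static int MinDistance(int[] left, int[] right)
        {
            int i = 0, j = 0, best = Int32.MaxValue;
            while (i < left.Length && j < right.Length)
            {
                best = Math.Min(best, Math.Abs(left[i] - right[j]));
                if (left[i] < right[j]) i++; else j++;
            }
            return best;
        }

        static void Main()
        {
            //  3 and 4 are the closest pair, so the minimal distance is 1.
            Console.WriteLine(MinDistance(new[] { 1, 3, 10 }, new[] { 4, 20 }));
        }
    }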
Code example #2
File: QueryProcessor.cs  Project: mo5h/omeo
        private static void AddMappedInstances(IList tempOffsets, int docIndex, InstanceOffset inst)
        {
            long   hashCode  = HC(docIndex, inst.OffsetNormal);
            object rightInst = MappedInstances[hashCode];

            while (rightInst != null)
            {
                tempOffsets.Add((InstanceOffset)rightInst);
                hashCode  = HC(docIndex, ((InstanceOffset)rightInst).OffsetNormal);
                rightInst = MappedInstances[hashCode];
            }
        }
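
AddMappedInstances follows a chain stored in the MappedInstances hashtable: each value points at the instance whose offset keys the next link, and the loop stops when no further mapping exists. Here is a rough, self-contained sketch of that chain-walking pattern with a simplified key function (HC's actual hashing is project-specific, so the bit-packing below is an assumption):

    using System;
    using System.Collections;

    static class ChainSketch
    {
        //  Packs a document index and an offset into a single lookup key
        //  (a simplified stand-in for the HC() helper).
        static long Key(int doc, int offset)
        {
            return ((long)doc << 32) | (uint)offset;
        }

        static void Main()
        {
            //  Each value holds the offset of the next instance in the chain.
            Hashtable next = new Hashtable();
            next.Add(Key(1, 10), 11);
            next.Add(Key(1, 11), 12);

            ArrayList collected = new ArrayList();
            object link = next[Key(1, 10)];
            while (link != null)                   // stop when the chain ends
            {
                collected.Add(link);
                link = next[Key(1, (int)link)];
            }
            Console.WriteLine(collected.Count);    // prints 2
        }
    }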
Code example #3
File: QueryProcessor.cs  Project: mo5h/omeo
        //-------------------------------------------------------------------------
        private static Entry  JoinInstancesOfEntries(Entry left, Entry right,
                                                     EntryProximity requiredProximity)
        {
            Entry JoinedEntry = new Entry();

            JoinedEntry.DocIndex  = left.DocIndex;
            JoinedEntry.TfIdf     = left.TfIdf + right.TfIdf;
            JoinedEntry.Proximity = left.Proximity;
            InstanceOffset[] joinedOffsets;

            //  If the required proximity is Phrase, then we need to highlight
            //  only those terms and show only those contexts which correspond
            //  to search term instances EXACTLY in the phrases found, and not
            //  others located elsewhere in the document.
            if (requiredProximity == EntryProximity.Phrase)
            {
                //  The assumption is made that all offsets in the entries are
                //  sorted in ascending order.

                ArrayList tempOffsets = new ArrayList();
                int       leftIndex = 0, rightIndex = 0;
                while (leftIndex < left.Count && rightIndex < right.Count)
                {
                    InstanceOffset leftOff = left.Offsets[leftIndex], rightOff = right.Offsets[rightIndex];
                    if (ProximityEstimator.isPhraseProximity(leftOff, rightOff))
                    {
                        tempOffsets.Add(leftOff);
                        tempOffsets.Add(rightOff);
                        AddMappedInstances(tempOffsets, left.DocIndex, rightOff);
                        MappedInstances[HC(left.DocIndex, leftOff.OffsetNormal)] = rightOff;
                    }
                    if (leftOff.OffsetNormal < rightOff.OffsetNormal)
                    {
                        leftIndex++;
                    }
                    else
                    {
                        rightIndex++;
                    }
                }
                joinedOffsets = (InstanceOffset[])tempOffsets.ToArray(typeof(InstanceOffset));
            }
            else
            {
                joinedOffsets = new InstanceOffset[left.Count + right.Count];
                left.Offsets.CopyTo(joinedOffsets, 0);
                right.Offsets.CopyTo(joinedOffsets, left.Count);
            }
            JoinedEntry.Offsets = joinedOffsets;

            return(JoinedEntry);
        }
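
In the Phrase branch the join keeps only those pairs of instances that pass isPhraseProximity, again relying on the ascending order of both offset arrays. The following self-contained sketch shows the same filtered merge on plain integer positions, treating "adjacent positions" as the phrase condition (a simplification of the TokenOrder check used in the project):

    using System;
    using System.Collections;

    static class PhraseJoinSketch
    {
        //  Keeps only those positions from two ascending-sorted arrays that
        //  form an adjacent pair (the right term directly follows the left one).
        static int[] JoinAdjacent(int[] left, int[] right)
        {
            ArrayList joined = new ArrayList();
            int i = 0, j = 0;
            while (i < left.Length && j < right.Length)
            {
                if (right[j] - left[i] == 1)
                {
                    joined.Add(left[i]);
                    joined.Add(right[j]);
                }
                if (left[i] < right[j]) i++; else j++;
            }
            return (int[])joined.ToArray(typeof(int));
        }

        static void Main()
        {
            //  Only the pair (7, 8) is adjacent, so it is the only one kept.
            int[] result = JoinAdjacent(new[] { 2, 7 }, new[] { 5, 8 });
            Console.WriteLine(result.Length);      // prints 2
        }
    }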
Code example #4
File: TermIndexRecord.cs  Project: mo5h/omeo
        //-------------------------------------------------------------------------
        //  Parses a plain sequence of bytes into entries and their instances.
        //  Comment: Some entries may be marked as "removed", which means that
        //           the corresponding documents no longer exist. Thus the field
        //           "DocsNumber" counts *ALL* entries - valid and removed - since
        //           we do not have the ability to physically strip the sequence
        //           of bytes. Non-existing documents are marked with "-1" as DocID,
        //           so we have to allocate the actual space only AFTER the number
        //           of entries is known.
        //-------------------------------------------------------------------------

        protected static void ParseEntry(BinaryReader reader)
        {
            int   instancesNumber;
            Entry new_ = new Entry();

            new_.DocIndex   = IndexConstructor.ReadCount(reader);
            new_.TfIdf      = reader.ReadSingle();
            instancesNumber = IndexConstructor.ReadCount(reader) + 1;

            if (instancesNumber < 0)
            {
                throw new FormatException("TermIndexRecord -- Illegal number of instances for a TermIndex record (" + instancesNumber + ") - possible index corruption");
            }

            // NB: Discuss an OpenAPI issue for getting the current maximal value of document Id
            //     from the ResourceStore.
            //            if( new_.DocIndex >= 10000000 )
            //                throw( new IndexConstructor.TextIndexCorruption( "[DocIndex=" + new_.DocIndex + "] value in [TermIndex record Entry] is greater than a reasonable number of documents - possible index corruption" ));

            //-----------------------------------------------------------------
            try
            {
                if (new_.DocIndex != -1)
                {
                    InstanceOffset[] Offsets = new InstanceOffset[instancesNumber];

                    for (int j = 0; j < instancesNumber; j++)
                    {
                        Offsets[j].Offset       = reader.ReadUInt32();
                        Offsets[j].CompoundInfo = reader.ReadUInt32();
                    }
                    new_.Offsets = Offsets;
                    listTemporaryStorage.Add(new_);
                }
                else
                {
                    //  this entry has been "removed", do not use in subsequent
                    //  processing
                    new_ = null;
                }
            }
            catch (OutOfMemoryException)
            {
                throw new FormatException("TermIndexRecord - illegal number of term instances: [" + instancesNumber + "]");
            }
        }
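
For a rough picture of the record layout ParseEntry expects - doc index, TF*IDF weight, instance count, then two 32-bit values per instance - here is a self-contained round-trip over a MemoryStream. Fixed-width Int32 fields stand in for IndexConstructor.ReadCount, whose variable-length encoding is project-specific:

    using System;
    using System.IO;

    static class RecordSketch
    {
        static void Main()
        {
            MemoryStream stream = new MemoryStream();
            BinaryWriter writer = new BinaryWriter(stream);
            writer.Write(42);                      // doc index
            writer.Write(0.75f);                   // TF*IDF weight
            writer.Write(2);                       // number of instances
            writer.Write(100u); writer.Write(7u);  // instance 1: offset, compound info
            writer.Write(200u); writer.Write(9u);  // instance 2: offset, compound info
            writer.Flush();

            stream.Position = 0;
            BinaryReader reader = new BinaryReader(stream);
            int   docIndex  = reader.ReadInt32();
            float tfIdf     = reader.ReadSingle();
            int   instances = reader.ReadInt32();
            for (int i = 0; i < instances; i++)
            {
                uint offset   = reader.ReadUInt32();
                uint compound = reader.ReadUInt32();
                Console.WriteLine(docIndex + " (" + tfIdf + "): " + offset + "/" + compound);
            }
        }
    }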
Code example #5
File: ContextConstructor.cs  Project: mo5h/omeo
        public static void  GetHighlightedTerms(Entry entry, string[] lexemes, out WordPtr[] anchors)
        {
            anchors = new WordPtr[entry.Count];

            Trace.WriteLine("HighlightTerms -- the following terms were processed for highlighting: ");
            for (int i = 0; i < entry.Count; i++)
            {
                InstanceOffset instance = entry.Instance(i);
                uint           offset   = instance.Offset;
                string         Lexeme   = lexemes[instance.BaseID];

                anchors[i].Original    = Lexeme;
                anchors[i].Text        = ReconstructWordform(offset, Lexeme, OMEnv.DictionaryServer);
                anchors[i].StartOffset = instance.OffsetNormal;
                anchors[i].SectionId   = (int)instance.SectionId;
                anchors[i].Section     = DocSectionHelper.FullNameByOrder(instance.SectionId);

//  trace section
                Trace.WriteLine("      [" + anchors[i].Text + "] at " + instance.OffsetNormal +
                                ", section " + anchors[i].Section + ", sentence " + instance.Sentence);
//  end trace section
            }
            Array.Sort(anchors, new AnchorComparer());
        }
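
The anchors are finally ordered with Array.Sort and a custom comparer. Below is a minimal sketch of that pattern with an illustrative struct sorted by start offset (assuming, as seems likely but is not confirmed by the source, that AnchorComparer orders anchors by their offsets):

    using System;
    using System.Collections;

    struct Anchor
    {
        public int    StartOffset;
        public string Text;
    }

    //  Orders anchors by their start offset, analogous to what a comparer
    //  passed to Array.Sort is expected to do here.
    class StartOffsetComparer : IComparer
    {
        public int Compare(object x, object y)
        {
            return ((Anchor)x).StartOffset.CompareTo(((Anchor)y).StartOffset);
        }
    }

    static class SortSketch
    {
        static void Main()
        {
            Anchor[] anchors =
            {
                new Anchor { StartOffset = 40, Text = "index" },
                new Anchor { StartOffset = 5,  Text = "term"  }
            };
            Array.Sort(anchors, new StartOffsetComparer());
            Console.WriteLine(anchors[0].Text);    // prints "term"
        }
    }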
Code example #6
File: QueryProcessor.cs  Project: mo5h/omeo
        internal static bool  isPhraseProximity(InstanceOffset left, InstanceOffset right)
        {
            return((left.TokenOrder - right.TokenOrder) == -1);
        }
Code example #7
File: ContextConstructor.cs  Project: mo5h/omeo
        public static string  GetContext(Entry termEntry, string[] lexemes, out ArrayList hgltPairs)
        {
            string context        = cNoContextSign;
            int    contextsNumber = Math.Min(MinimalNumberOfContexts, termEntry.Count);

            int[] shifts = new int[termEntry.Count];
            hgltPairs = new ArrayList();
            Collector.Init(termEntry.Offsets, shifts);

            try
            {
                //  it is possible that the temporary file is removed
                //  during this processing.
                IResource res = Core.ResourceStore.TryLoadResource(termEntry.DocIndex);
                if (res != null)
                {
                    Core.PluginLoader.InvokeResourceTextProviders(res, Collector);

                    if (Collector.Body.Length > 0)
                    {
                        context = cFragmentsDelimiter;
                        int leftBorder = Int32.MaxValue, rightBorder = Int32.MinValue;
                        int prevContextLength = 0;
                        for (int i = 0; i < contextsNumber; i++)
                        {
                            InstanceOffset instance         = termEntry.Instance(i);
                            int            origOffset       = instance.OffsetNormal;
                            int            offset           = Collector.ConvertOffset(origOffset, instance.SectionId);
                            ArrayList      delimiterOffsets = new ArrayList();

                            //  workaround for possibly invalid text body reconstruction
                            //  by a plugin, when search terms appear outside the text margins...
                            if (offset < Collector.Body.Length)
                            {
                                if (offset < leftBorder || offset > rightBorder)
                                {
                                    leftBorder  = Math.Max(0, offset - cContextSideLength);
                                    rightBorder = Math.Min(Collector.Body.Length - 1, offset + cContextSideLength);
                                    TuneBorders(offset, Collector.Body, ref leftBorder, ref rightBorder);

                                    string fragment = Collector.Body.Substring(leftBorder, rightBorder - leftBorder + 1);
                                    InsertSectionDelimiters(ref fragment, leftBorder, rightBorder, context.Length, delimiterOffsets);

                                    prevContextLength = context.Length;
                                    context          += fragment + cFragmentsDelimiter;
                                }
                                else
                                if (contextsNumber < termEntry.Count)
                                {
                                    contextsNumber++;
                                }

                                int    startOffset = offset - leftBorder + prevContextLength;
                                string lexeme      = lexemes[instance.BaseID];
                                lexeme = ReconstructWordform(instance.Offset, lexeme, OMEnv.DictionaryServer);
                                TuneOffsetByBorders(ref startOffset, delimiterOffsets);

                                hgltPairs.Add(new OffsetData(startOffset, lexeme.Length));
                            }
                        }
                        context = context.Replace("\r\n", "  ");
                        context = context.Replace("\n", " ");
                        context = context.Replace("\r", " ");
                        context = context.Replace("\t", " ");
                        Trace.WriteLine("ContextExtractor -- context for [" + termEntry.DocIndex + "/" + res.Type + "] is [" + context + "]");
                        foreach (OffsetData pair in hgltPairs)
                        {
                            if (pair.Start + pair.Length >= context.Length)
                            {
                                Trace.WriteLine("                  highlight prefix of token [" + context.Substring(pair.Start) + "]");
                            }
                            else
                            {
                                Trace.WriteLine("                  highlight token [" + context.Substring(pair.Start, pair.Length) + "]");
                            }
                        }
                    }
                }
            }
            catch
            {
                //  Here we catch the exceptions described in OM-10659, the reason
                //  for which has not yet been found. Just hide the bug.
            }

            return(context);
        }
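
At its core, the fragment extraction above clamps a fixed-size window around each hit to the boundaries of the reconstructed text body before tuning the borders and inserting delimiters. A small self-contained sketch of just that clamping step follows (the window size and names are illustrative, not the constants used by ContextConstructor):

    using System;

    static class ContextWindowSketch
    {
        //  Returns up to `side` characters on each side of the hit offset,
        //  clamped to the boundaries of the text body.
        static string Window(string body, int offset, int side)
        {
            int left  = Math.Max(0, offset - side);
            int right = Math.Min(body.Length - 1, offset + side);
            return body.Substring(left, right - left + 1);
        }

        static void Main()
        {
            string body = "full-text search over indexed documents";
            int    hit  = body.IndexOf("search");
            Console.WriteLine(Window(body, hit, 10));   // prints "full-text search over"
        }
    }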