/// <summary>
        /// Merges runs of consecutive duration extractions into a single extraction when the text
        /// between each pair matches <c>DurationConnectorRegex</c> and the following extraction's
        /// unit value differs from the smallest unit value seen so far in the run.
        /// </summary>
        /// <param name="text">The source text the extractions' Start/Length values index into.</param>
        /// <param name="extractorResults">The individual duration extractions, in the order they are to be examined.</param>
        /// <returns>
        /// A new list containing merged and unmerged extractions. The input list itself is returned
        /// when it has at most one element, or when two neighbouring extractions overlap.
        /// Extractions whose text has no unit known to <c>UnitMap</c> are not copied to the output.
        /// A merged extraction carries one of the <c>Constants.MultipleDuration_*</c> values in Data.
        /// </returns>
        private List <ExtractResult> MergeMultipleDuration(string text, List <ExtractResult> extractorResults)
        {
            if (extractorResults.Count <= 1)
            {
                return(extractorResults);
            }

            var unitMap              = this.config.UnitMap;
            var unitValueMap         = this.config.UnitValueMap;
            var unitRegex            = this.config.DurationUnitRegex;
            List <ExtractResult> ret = new List <ExtractResult>();

            var firstExtractionIndex = 0;

            while (firstExtractionIndex < extractorResults.Count)
            {
                string curUnit   = null;
                var    unitMatch = unitRegex.Match(extractorResults[firstExtractionIndex].Text);

                if (unitMatch.Success && unitMap.ContainsKey(unitMatch.Groups["unit"].ToString()))
                {
                    curUnit = unitMatch.Groups["unit"].ToString();
                }

                // No recognizable unit: this extraction cannot start a run and is skipped.
                if (string.IsNullOrEmpty(curUnit))
                {
                    firstExtractionIndex++;
                    continue;
                }

                // Unit counters are scoped to the current run so that counts from earlier,
                // unmerged extractions cannot skew the type assigned to a later merged run.
                var totalUnit = 1;
                var timeUnit  = DurationParsingUtil.IsTimeDurationUnit(unitMap[curUnit]) ? 1 : 0;

                var secondExtractionIndex = firstExtractionIndex + 1;
                while (secondExtractionIndex < extractorResults.Count)
                {
                    var valid = false;

                    // '+' binds tighter than '??', so this is (Start + Length) ?? 0.
                    var midStrBegin = extractorResults[secondExtractionIndex - 1].Start + extractorResults[secondExtractionIndex - 1].Length ?? 0;
                    var midStrEnd   = extractorResults[secondExtractionIndex].Start ?? 0;

                    // Overlapping extractions: give up on merging and hand back the input unchanged.
                    if (midStrBegin > midStrEnd)
                    {
                        return(extractorResults);
                    }

                    var midStr = text.Substring(midStrBegin, midStrEnd - midStrBegin);
                    var match  = this.config.DurationConnectorRegex.Match(midStr);
                    if (match.Success)
                    {
                        unitMatch = unitRegex.Match(extractorResults[secondExtractionIndex].Text);
                        if (unitMatch.Success && unitMap.ContainsKey(unitMatch.Groups["unit"].ToString()))
                        {
                            var nextUnitStr = unitMatch.Groups["unit"].ToString();
                            if (unitValueMap[unitMap[nextUnitStr]] != unitValueMap[unitMap[curUnit]])
                            {
                                valid = true;

                                // Track the smallest unit value seen so far in the run.
                                if (unitValueMap[unitMap[nextUnitStr]] < unitValueMap[unitMap[curUnit]])
                                {
                                    curUnit = nextUnitStr;
                                }

                                // Count only extractions that actually join the run.
                                totalUnit++;
                                if (DurationParsingUtil.IsTimeDurationUnit(unitMap[nextUnitStr]))
                                {
                                    timeUnit++;
                                }
                            }
                        }
                    }

                    if (!valid)
                    {
                        break;
                    }

                    secondExtractionIndex++;
                }

                if (secondExtractionIndex - 1 > firstExtractionIndex)
                {
                    // At least two extractions were chained: build one node spanning the whole run.
                    var node = new ExtractResult();
                    node.Start  = extractorResults[firstExtractionIndex].Start;
                    node.Length = extractorResults[secondExtractionIndex - 1].Start + extractorResults[secondExtractionIndex - 1].Length - node.Start;
                    node.Text   = text.Substring(node.Start ?? 0, node.Length ?? 0);
                    node.Type   = extractorResults[firstExtractionIndex].Type;

                    // Add multiple duration type to extract result
                    string type = Constants.MultipleDuration_DateTime; // Default type
                    if (timeUnit == totalUnit)
                    {
                        type = Constants.MultipleDuration_Time;
                    }
                    else if (timeUnit == 0)
                    {
                        type = Constants.MultipleDuration_Date;
                    }

                    node.Data = type;

                    ret.Add(node);
                }
                else
                {
                    ret.Add(extractorResults[firstExtractionIndex]);
                }

                firstExtractionIndex = secondExtractionIndex;
            }

            return(ret);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Merges runs of consecutive duration extractions into a single extraction when the text
        /// between each pair matches <c>DurationConnectorRegex</c> and the following extraction's
        /// unit value differs from the smallest unit value seen so far in the run. A merged run is
        /// split back into its original extractions when it is both preceded by a
        /// <c>ModPrefixRegex</c> match and followed by a <c>ModSuffixRegex</c> match.
        /// </summary>
        /// <param name="text">The source text the extractions' Start/Length values index into.</param>
        /// <param name="extractorResults">The individual duration extractions, in the order they are to be examined.</param>
        /// <returns>
        /// A new list containing merged and unmerged extractions. The input list itself is returned
        /// when it has at most one element, or when two neighbouring extractions overlap.
        /// Extractions whose text has no unit known to <c>UnitMap</c> are not copied to the output.
        /// A merged extraction carries one of the <c>Constants.MultipleDuration_*</c> values in Data.
        /// </returns>
        private List <ExtractResult> MergeMultipleDuration(string text, List <ExtractResult> extractorResults)
        {
            if (extractorResults.Count <= 1)
            {
                return(extractorResults);
            }

            var unitMap      = this.config.UnitMap;
            var unitValueMap = this.config.UnitValueMap;
            var unitRegex    = this.config.DurationUnitRegex;
            List <ExtractResult>         results         = new List <ExtractResult>();
            List <List <ExtractResult> > separateResults = new List <List <ExtractResult> >();

            var firstExtractionIndex = 0;

            while (firstExtractionIndex < extractorResults.Count)
            {
                string curUnit   = null;
                var    unitMatch = unitRegex.Match(extractorResults[firstExtractionIndex].Text);

                if (unitMatch.Success && unitMap.ContainsKey(unitMatch.Groups["unit"].ToString()))
                {
                    curUnit = unitMatch.Groups["unit"].ToString();
                }

                // No recognizable unit: this extraction cannot start a run and is skipped.
                if (string.IsNullOrEmpty(curUnit))
                {
                    firstExtractionIndex++;
                    continue;
                }

                // Unit counters are scoped to the current run so that counts from earlier,
                // unmerged extractions cannot skew the type assigned to a later merged run.
                var totalUnit = 1;
                var timeUnit  = DurationParsingUtil.IsTimeDurationUnit(unitMap[curUnit]) ? 1 : 0;

                // Add extraction to list of separate results (needed in case the extractions should not be merged)
                List <ExtractResult> separateList = new List <ExtractResult>()
                {
                    extractorResults[firstExtractionIndex]
                };

                var secondExtractionIndex = firstExtractionIndex + 1;
                while (secondExtractionIndex < extractorResults.Count)
                {
                    var valid = false;

                    // '+' binds tighter than '??', so this is (Start + Length) ?? 0.
                    var midStrBegin = extractorResults[secondExtractionIndex - 1].Start + extractorResults[secondExtractionIndex - 1].Length ?? 0;
                    var midStrEnd   = extractorResults[secondExtractionIndex].Start ?? 0;

                    // Overlapping extractions would make the Substring length negative and throw;
                    // give up on merging and hand back the input unchanged.
                    if (midStrBegin > midStrEnd)
                    {
                        return(extractorResults);
                    }

                    var midStr = text.Substring(midStrBegin, midStrEnd - midStrBegin);
                    var match  = this.config.DurationConnectorRegex.Match(midStr);
                    if (match.Success)
                    {
                        unitMatch = unitRegex.Match(extractorResults[secondExtractionIndex].Text);
                        if (unitMatch.Success && unitMap.ContainsKey(unitMatch.Groups["unit"].ToString()))
                        {
                            var nextUnitStr = unitMatch.Groups["unit"].ToString();
                            if (unitValueMap[nextUnitStr] != unitValueMap[curUnit])
                            {
                                valid = true;

                                // Track the smallest unit value seen so far in the run.
                                if (unitValueMap[nextUnitStr] < unitValueMap[curUnit])
                                {
                                    curUnit = nextUnitStr;
                                }

                                // Count only extractions that actually join the run.
                                totalUnit++;
                                if (DurationParsingUtil.IsTimeDurationUnit(unitMap[nextUnitStr]))
                                {
                                    timeUnit++;
                                }
                            }
                        }
                    }

                    if (!valid)
                    {
                        break;
                    }

                    // Add extraction to list of separate results (needed in case the extractions should not be merged)
                    separateList.Add(extractorResults[secondExtractionIndex]);

                    secondExtractionIndex++;
                }

                if (secondExtractionIndex - 1 > firstExtractionIndex)
                {
                    // At least two extractions were chained: build one node spanning the whole run.
                    var node = new ExtractResult();
                    node.Start  = extractorResults[firstExtractionIndex].Start;
                    node.Length = extractorResults[secondExtractionIndex - 1].Start + extractorResults[secondExtractionIndex - 1].Length - node.Start;
                    node.Text   = text.Substring(node.Start ?? 0, node.Length ?? 0);
                    node.Type   = extractorResults[firstExtractionIndex].Type;

                    // Add multiple duration type to extract result
                    string type = Constants.MultipleDuration_DateTime; // Default type
                    if (timeUnit == totalUnit)
                    {
                        type = Constants.MultipleDuration_Time;
                    }
                    else if (timeUnit == 0)
                    {
                        type = Constants.MultipleDuration_Date;
                    }

                    node.Data = type;

                    results.Add(node);
                }
                else
                {
                    results.Add(extractorResults[firstExtractionIndex]);
                }

                // Add list of separate extractions to separateResults, so that there is a 1 to 1 correspondence
                // between results (list of merged extractions) and separateResults (list of unmerged extractions)
                separateResults.Add(separateList);

                firstExtractionIndex = secondExtractionIndex;
            }

            // If the first and last elements of a group of contiguous extractions are both preceded/followed by modifiers,
            // they should not be merged, e.g. "last 2 weeks and 3 days ago"
            // Iterating backwards keeps indices below i aligned with separateResults while entries are expanded.
            for (int i = results.Count - 1; i >= 0; i--)
            {
                var start     = (int)results[i].Start;
                var end       = start + (int)results[i].Length;
                var beforeStr = text.Substring(0, start);
                var afterStr  = text.Substring(end);
                var beforeMod = this.config.ModPrefixRegex.MatchEnd(beforeStr, trim: true);
                var afterMod  = this.config.ModSuffixRegex.MatchBegin(afterStr, trim: true);
                if (beforeMod.Success && afterMod.Success)
                {
                    results.RemoveAt(i);
                    results.InsertRange(i, separateResults[i]);
                }
            }

            return(results);
        }