/// <summary>
/// Extracts number-with-unit results from <paramref name="source"/>.
/// </summary>
/// <param name="source">The text to extract from.</param>
/// <returns>
/// The merged compound-unit results when the configured extract type equals
/// <c>Constants.SYS_UNIT_CURRENCY</c>; otherwise the results of the wrapped
/// number-with-unit extractor, unchanged.
/// </returns>
public List<ExtractResult> Extract(string source)
{
    // Only merge currency's compound units for now.
    if (config.ExtractType.Equals(Constants.SYS_UNIT_CURRENCY))
    {
        return MergeCompoundUnits(source);
    }

    // Reuse the extractor created once in the constructor rather than
    // building a new NumberWithUnitExtractor on every call.
    return numberWithUnitExtractor.Extract(source);
}
/// <summary>
/// Extracts number-with-unit results from <paramref name="source"/> and merges
/// neighbouring results that are separated only by whitespace or by text fully
/// matched by the configured compound-unit connector regex.
/// </summary>
/// <param name="source">The text to extract from.</param>
/// <returns>
/// One entry per group of mergeable results. A group with several members is
/// returned as a single result typed <c>Constants.SYS_UNIT_CURRENCY</c> that spans
/// all members and carries them as a <c>List&lt;ExtractResult&gt;</c> in <c>Data</c>.
/// A group with one member is returned as a plain copy of that member. Results
/// whose type is <c>Constants.SYS_NUM</c> are removed from the output.
/// </returns>
private List<ExtractResult> MergeCompoundUnits(string source)
{
    var result = new List<ExtractResult>();

    // Reuse the extractor created once in the constructor rather than
    // building a new NumberWithUnitExtractor on every call.
    var ers = numberWithUnitExtractor.Extract(source);

    // NOTE(review): MergePureNumber is defined outside this block; judging by the
    // SYS_NUM checks below it presumably adds bare-number results to `ers` - confirm.
    MergePureNumber(source, ers);

    if (ers.Count == 0)
    {
        return result;
    }

    // groups[i] is the index in `result` of the entry that ers[i] belongs to.
    // The reduce loop below indexes `result` by group id, so ids must start at 0
    // and grow by exactly one whenever a new group starts.
    var groups = new int[ers.Count];
    groups[0] = 0;

    for (var idx = 0; idx < ers.Count - 1; idx++)
    {
        var current = ers[idx];
        var next = ers[idx + 1];

        // Results of different types never merge unless one of them is a number.
        // The next result must explicitly open a new group: leaving its slot at the
        // array default (0) would either merge it into group 0 or break the
        // "group id == result index" invariant.
        if (current.Type != next.Type &&
            !current.Type.Equals(Constants.SYS_NUM) &&
            !next.Type.Equals(Constants.SYS_NUM))
        {
            groups[idx + 1] = groups[idx] + 1;
            continue;
        }

        // A current result that wraps an inner ExtractResult whose Data does not
        // start with "Integer" closes the group. Ordinal comparison keeps the
        // check independent of the current culture.
        if (current.Data is ExtractResult er &&
            !er.Data.ToString().StartsWith("Integer", System.StringComparison.Ordinal))
        {
            groups[idx + 1] = groups[idx] + 1;
            continue;
        }

        var middleBegin = (current.Start + current.Length) ?? 0;
        var middleEnd = next.Start ?? 0;
        var middleLength = middleEnd - middleBegin;

        // Overlapping results have no text between them; Substring would throw on
        // a negative length, so treat them as not mergeable.
        if (middleLength < 0)
        {
            groups[idx + 1] = groups[idx] + 1;
            continue;
        }

        var middleStr = source.Substring(middleBegin, middleLength).Trim().ToLowerInvariant();

        // Separated by whitespace
        if (string.IsNullOrEmpty(middleStr))
        {
            groups[idx + 1] = groups[idx];
            continue;
        }

        // Separated by connectors: the connector must cover the whole gap.
        var match = config.CompoundUnitConnectorRegex.Match(middleStr);
        if (match.Success && match.Index == 0 && match.Length == middleStr.Length)
        {
            groups[idx + 1] = groups[idx];
        }
        else
        {
            groups[idx + 1] = groups[idx] + 1;
        }
    }

    for (var idx = 0; idx < ers.Count; idx++)
    {
        if (idx == 0 || groups[idx] != groups[idx - 1])
        {
            // First member of a group: it becomes the group's result entry. Its Data
            // is replaced by a list holding a snapshot of its own original values
            // (the snapshot is built before Data is overwritten).
            var tmpExtractResult = ers[idx];
            tmpExtractResult.Data = new List<ExtractResult>
            {
                new ExtractResult
                {
                    Data = ers[idx].Data,
                    Length = ers[idx].Length,
                    Start = ers[idx].Start,
                    Text = ers[idx].Text,
                    Type = ers[idx].Type,
                },
            };

            result.Add(tmpExtractResult);
        }

        // Reduce extract results in same group
        if (idx + 1 < ers.Count && groups[idx + 1] == groups[idx])
        {
            var group = groups[idx];

            // Stretch the group entry so it ends where the next member ends.
            var periodBegin = result[group].Start ?? 0;
            var periodEnd = (ers[idx + 1].Start ?? 0) + (ers[idx + 1].Length ?? 0);

            result[group].Length = periodEnd - periodBegin;
            result[group].Text = source.Substring(periodBegin, periodEnd - periodBegin);
            result[group].Type = Constants.SYS_UNIT_CURRENCY;
            (result[group].Data as List<ExtractResult>)?.Add(ers[idx + 1]);
        }
    }

    // Groups that ended up with a single member are unwrapped back to the
    // snapshot of that member.
    for (var idx = 0; idx < result.Count; idx++)
    {
        var innerData = result[idx].Data as List<ExtractResult>;
        if (innerData?.Count == 1)
        {
            result[idx] = innerData[0];
        }
    }

    // Bare numbers that were not merged into a compound unit are dropped.
    result.RemoveAll(o => o.Type == Constants.SYS_NUM);

    return result;
}
/// <summary>
/// Initializes a new instance of the <see cref="BaseMergedUnitExtractor"/> class.
/// Stores the configuration and creates the number-with-unit extractor built from it.
/// </summary>
/// <param name="config">The extractor configuration to store and to pass to the inner extractor.</param>
public BaseMergedUnitExtractor(INumberWithUnitExtractorConfiguration config)
{
    this.config = config;

    var unitExtractor = new NumberWithUnitExtractor(config);
    numberWithUnitExtractor = unitExtractor;
}