/// <summary>
/// Merges three partial search results with a maximum count of 5 and a cost slack of 3,
/// and expects only four merged results, checked by cost and by token sequence.
/// </summary>
public void TestMergerTooFew()
{
    const int maxCount = 5;
    const int costSlack = 3;
    var merger = new MultiSearchMerger(maxCount, costSlack);
    var results = new List<MultiSearchResult>();

    // First segment: three candidate tokenizations.
    var surfaces1 = new string[][]
    {
        new string[] { "a", "b" },
        new string[] { "c", "d" },
        new string[] { "e", "f" }
    };
    var costs1 = new int[] { 1, 2, 5 };
    results.Add(MakeResult(surfaces1, costs1));

    // Second segment: two candidate tokenizations.
    var surfaces2 = new string[][]
    {
        new string[] { "a", "b" },
        new string[] { "c", "d" }
    };
    var costs2 = new int[] { 1, 2 };
    results.Add(MakeResult(surfaces2, costs2));

    // Third segment: a single candidate tokenization.
    var surfaces3 = new string[][]
    {
        new string[] { "a", "b" }
    };
    var costs3 = new int[] { 5 };
    results.Add(MakeResult(surfaces3, costs3));

    MultiSearchResult mergedResult = merger.Merge(results);

    // Expected merged entries, listed in the order the merger is expected to return them.
    int[] expectedCosts = { 7, 8, 8, 9 };
    string[] expectedTokens =
    {
        "a b a b a b",
        "c d a b a b",
        "a b c d a b",
        "c d c d a b"
    };

    Assert.AreEqual(4, mergedResult.Count);
    for (int i = 0; i < expectedCosts.Length; i++)
    {
        Assert.AreEqual(expectedCosts[i], mergedResult.GetCost(i));
    }
    for (int i = 0; i < expectedTokens.Length; i++)
    {
        Assert.AreEqual(expectedTokens[i], GetSpaceSeparatedTokens(mergedResult.GetTokenizedResult(i)));
    }
}
/// <summary>
/// Tokenizes the provided text and returns up to <paramref name="maxCount"/> lists of tokens
/// with various feature information. Each list corresponds to a possible tokenization with
/// cost at most OPT + <paramref name="costSlack"/>, where OPT is the cost of the optimal solution.
/// </summary>
/// <remarks>
/// When splitting is enabled and split positions are found, each segment is searched
/// separately and the per-segment results are combined with a <c>MultiSearchMerger</c>.
/// NOTE(review): the original comment states this method is thread safe; that depends on
/// helpers not visible here - confirm.
/// </remarks>
/// <param name="text">text to tokenize</param>
/// <param name="maxCount">maximum number of different tokenizations</param>
/// <param name="costSlack">maximum cost slack of a tokenization</param>
/// <returns>list of token lists, not null</returns>
protected List<LinkedList<T>> CreateMultiTokenList(string text, int maxCount, int costSlack)
{
    // With splitting disabled the text is treated as having no split positions,
    // so both "no split" cases share the single whole-text search below.
    List<int> boundaries = split ? GetSplitPositions(text) : new List<int>();
    if (boundaries.Count == 0)
    {
        MultiSearchResult whole = CreateMultiSearchResult(text, maxCount, costSlack);
        return ConvertMultiSearchResultToList(whole);
    }

    List<MultiSearchResult> segmentResults = new List<MultiSearchResult>();
    int start = 0;
    foreach (int boundary in boundaries)
    {
        // Each segment runs up to and including the split position.
        int end = boundary + 1;
        string segment = text.Substring(start, end - start);
        segmentResults.Add(CreateMultiSearchResult(segment, maxCount, costSlack));
        start = end;
    }

    // Any text remaining after the last split position forms a final segment.
    if (start < text.Length)
    {
        string tail = text.Substring(start);
        segmentResults.Add(CreateMultiSearchResult(tail, maxCount, costSlack));
    }

    MultiSearchMerger combiner = new MultiSearchMerger(maxCount, costSlack);
    return ConvertMultiSearchResultToList(combiner.Merge(segmentResults));
}
/// <summary>
/// Merges three partial search results with a maximum count of 5 and a cost slack of 3,
/// and expects only four merged results, checked by cost and by token sequence.
/// </summary>
public void TestMergerTooFew()
{
    const int maxCount = 5;
    const int costSlack = 3;
    var merger = new MultiSearchMerger(maxCount, costSlack);

    // One partial result per segment: three, two and one candidate tokenizations respectively.
    var results = new List<MultiSearchResult>
    {
        MakeResult(
            new[] { new[] { "a", "b" }, new[] { "c", "d" }, new[] { "e", "f" } },
            new[] { 1, 2, 5 }),
        MakeResult(
            new[] { new[] { "a", "b" }, new[] { "c", "d" } },
            new[] { 1, 2 }),
        MakeResult(
            new[] { new[] { "a", "b" } },
            new[] { 5 })
    };

    MultiSearchResult mergedResult = merger.Merge(results);

    // Expected merged entries, listed in the order the merger is expected to return them.
    int[] expectedCosts = { 7, 8, 8, 9 };
    string[] expectedTokens =
    {
        "a b a b a b",
        "c d a b a b",
        "a b c d a b",
        "c d c d a b"
    };

    mergedResult.Count.Is(4);
    for (int index = 0; index < expectedCosts.Length; index++)
    {
        mergedResult.GetCost(index).Is(expectedCosts[index]);
    }
    for (int index = 0; index < expectedTokens.Length; index++)
    {
        GetSpaceSeparatedTokens(mergedResult.GetTokenizedResult(index)).Is(expectedTokens[index]);
    }
}