/// <summary>
/// Drops repeated tokens from every disjunct, modifying each inner list in place.
/// </summary>
/// <remarks>
/// Tokens are compared through their <c>ToString()</c> text using a <c>DefaultTokenComparer</c>
/// built with the <c>ConvertUmlauts</c> and <c>RemoveDiacritics</c> options. A position is removed
/// when its text compares equal to the text of an earlier position that was not itself removed.
/// </remarks>
/// <param name="disjuncts">Lists of tokens to deduplicate.</param>
public static void RemoveDuplicatedTokens(ArrayList<ArrayList<QueryToken>> disjuncts)
{
    IEqualityComparer<string> comparer = new DefaultTokenComparer(
        DefaultTokenComparer.Options.ConvertUmlauts | DefaultTokenComparer.Options.RemoveDiacritics);

    for (int d = 0; d < disjuncts.Count(); ++d)
    {
        // isDuplicate[p] becomes true once position p matched an earlier surviving position.
        bool[] isDuplicate = new bool[disjuncts[d].Count()];

        for (int first = 0; first < disjuncts[d].Count(); ++first)
        {
            if (isDuplicate[first])
            {
                continue;
            }

            for (int later = first + 1; later < disjuncts[d].Count(); ++later)
            {
                if (comparer.Equals(disjuncts[d][first].ToString(), disjuncts[d][later].ToString()))
                {
                    isDuplicate[later] = true;
                }
            }
        }

        // Delete from the back so the positions still to be removed stay valid.
        for (int position = isDuplicate.Length - 1; position >= 0; --position)
        {
            if (isDuplicate[position])
            {
                disjuncts[d].RemoveAt(position);
            }
        }
    }
}
/// <summary>
/// Sends a command built with <c>SerializedMode.Row</c> followed by <c>expectedCount</c> pickled rows
/// to the worker, then reads the worker's replies, deserializes each one with
/// <c>BinaryFormatter</c> and checks that it holds exactly one row whose "age" value equals the
/// zero-based position of the reply.
/// </summary>
public void TestWorkerWithRowDeserializedModeAndBytesSerializedMode()
{
    const int expectedCount = 5;

    Process worker;
    var socketServer = CreateServer(out worker);
    try
    {
        using (var serverSocket = socketServer.Accept())
        using (var s = serverSocket.GetStream())
        {
            WritePayloadHeaderToWorker(s);

            // The command is sent length-prefixed: first its byte count, then the bytes.
            byte[] commandWithRowDeserializeMode =
                SparkContext.BuildCommand(new CSharpWorkerFunc((pid, iter) => iter), SerializedMode.Row);
            SerDe.Write(s, commandWithRowDeserializeMode.Length);
            SerDe.Write(s, commandWithRowDeserializeMode);

            // Register the picklers before any row is pickled below.
            new StructTypePickler().Register();
            new RowPickler().Register();
            Pickler pickler = new Pickler();

            for (int i = 0; i < expectedCount; i++)
            {
                byte[] pickleBytes = pickler.dumps(new[] { RowHelper.BuildRowForBasicSchema(i) });
                SerDe.Write(s, pickleBytes.Length);
                SerDe.Write(s, pickleBytes);
            }

            SerDe.Write(s, (int)SpecialLengths.END_OF_DATA_SECTION);
            SerDe.Write(s, (int)SpecialLengths.END_OF_STREAM);
            s.Flush();

            int count = 0;
            // NOTE(review): BinaryFormatter is unsafe on untrusted data; it is kept because this test
            // decodes the worker's own reply, which presumably uses this format — confirm.
            var formatter = new BinaryFormatter();
            foreach (var bytes in ReadWorker(s))
            {
                Row[] rows;
                // Dispose the per-reply stream instead of leaving it to the garbage collector.
                using (var ms = new MemoryStream(bytes))
                {
                    rows = new ArrayList { formatter.Deserialize(ms) }.Cast<Row>().ToArray();
                }

                Assert.AreEqual(1, rows.Length);
                Assert.AreEqual(count, rows[0].Get("age"));
                count++;
            }

            Assert.AreEqual(expectedCount, count);
        }

        AssertWorker(worker);
    }
    finally
    {
        // Close the server even when an assertion or I/O call above throws.
        socketServer.Close();
    }
}
/// <summary>
/// Removes, in place, every disjunct for which <c>ContainsTokenSubset</c> reports a match against
/// some disjunct located later in the list.
/// </summary>
/// <remarks>
/// Only pairs (i, j) with j &gt; i are examined, the call made is
/// <c>ContainsTokenSubset(disjuncts[j], disjuncts[i])</c>, and the earlier element i is the one removed.
/// NOTE(review): <c>ContainsTokenSubset</c> is defined elsewhere; presumably it is true when every
/// token of its first argument occurs in its second — confirm.
/// </remarks>
/// <param name="disjuncts">List of disjuncts to prune.</param>
public static void RemoveDuplicatedDisjuncts(ArrayList<ArrayList<QueryToken>> disjuncts)
{
    ArrayList<int> disjunctsToRemove = new ArrayList<int>();

    for (int i = 0; i < disjuncts.Count(); ++i)
    {
        for (int j = i + 1; j < disjuncts.Count(); ++j)
        {
            if (ContainsTokenSubset(disjuncts[j], disjuncts[i]))
            {
                disjunctsToRemove.Add(i);
                break;
            }
        }
    }

    // Indices were appended in ascending order and at most once each, so walking the list
    // backwards removes from the highest index down and keeps the remaining indices valid.
    for (int k = disjunctsToRemove.Count() - 1; k >= 0; --k)
    {
        disjuncts.RemoveAt(disjunctsToRemove[k]);
    }
}
/// <summary>
/// Takes the grouped subsets whose occurrence count equals <paramref name="maxOccurance"/>, uses them to
/// remove matching disjuncts from <paramref name="disjuncts"/>, and appends every subset that actually
/// removed at least one disjunct to <paramref name="finalQueryList"/>.
/// </summary>
/// <remarks>
/// When <paramref name="maxOccurance"/> is greater than 1, each candidate subset is first passed to
/// <c>TwitterProfiler.ProfileFrequency</c> and is only used when the returned frequency is below
/// <paramref name="twitterProfilerFrequencyThreshold"/>. If the last candidate is not below the threshold,
/// the method calls itself with the largest occurrence value that is smaller than the current one.
/// If the profiler result, once rounded, equals <c>TwitterProfiler.InvalidReturns.Exception</c>, an error is
/// printed and the process is terminated with exit code 1.
/// The method returns as soon as <paramref name="disjuncts"/> becomes empty.
/// </remarks>
/// <param name="disjuncts">Remaining disjuncts; modified in place.</param>
/// <param name="finalQueryList">Receives the subsets that removed at least one disjunct.</param>
/// <param name="twitterProfilerFrequencyThreshold">Frequencies below this value make a subset usable.</param>
/// <param name="subsetsGrouped">Candidate subsets; indexed in parallel with <paramref name="subsetsGroupedOccurance"/>.</param>
/// <param name="subsetsGroupedOccurance">Occurrence count for the subset at the same index.</param>
/// <param name="maxOccurance">Occurrence count to select in this pass.</param>
private static void ExtractUniqueQueryTokensBasedOnOccurance(ArrayList<ArrayList<QueryToken>> disjuncts, ArrayList<ArrayList<QueryToken>> finalQueryList, double twitterProfilerFrequencyThreshold, ArrayList<ArrayList<QueryToken>> subsetsGrouped, ArrayList<int> subsetsGroupedOccurance, int maxOccurance)
{
    // Candidates for this pass: subsets with exactly maxOccurance occurrences, smallest first.
    var subsetsWithMaxOccurance = subsetsGrouped
        .Where((x, idx) => subsetsGroupedOccurance[idx] == maxOccurance)
        .OrderBy(x => x.Count)
        .ToList();

    // remove subsets which are contained in other ones
    // (index i is dropped when ContainsTokenSubset(candidate i, candidate j) holds for some later j)
    ArrayList<int> subsetsToRemove = new ArrayList<int>();
    for (int i = 0; i < subsetsWithMaxOccurance.Count(); ++i)
    {
        for (int j = i + 1; j < subsetsWithMaxOccurance.Count(); ++j)
        {
            if (ContainsTokenSubset(subsetsWithMaxOccurance[i], subsetsWithMaxOccurance[j]))
            {
                subsetsToRemove.Add(i);
                break;
            }
        }
    }
    // Remove from the highest index down so the remaining indices stay valid.
    subsetsToRemove.Sort();
    subsetsToRemove.Reverse();
    foreach (var toRemove in subsetsToRemove.Distinct())
    {
        subsetsWithMaxOccurance.RemoveAt(toRemove);
    }
    subsetsToRemove.Clear();

    if (maxOccurance > 1) // perform twitter profiling
    {
        for (int i = 0; i < subsetsWithMaxOccurance.Count(); ++i)
        {
            // twitter profiling: turn the subset into a search string and ask the profiler for its frequency
            string searchQuery = TwitterProfiler.QueryTokensToStringConverter(subsetsWithMaxOccurance[i]);
            double twitterProfilerFrequency = TwitterProfiler.ProfileFrequency(searchQuery);
            // The rounded result matching InvalidReturns.Exception is treated as a profiler failure:
            // report it and terminate the process.
            if (Math.Round(twitterProfilerFrequency) == Convert.ToInt32(TwitterProfiler.InvalidReturns.Exception))
            {
                Console.WriteLine("\nError during twitter profiling: {0}!", TwitterProfiler.GetExceptionError());
                System.Environment.Exit(1);
            }
            if (twitterProfilerFrequency < twitterProfilerFrequencyThreshold)
            {
                int disjunctsCountBeforeRemoval = disjuncts.Count();
                // remove disjuncts from the original list that contain the selected subset
                QueryConverterUtils.RemoveDisjunctsContainingSubsets(disjuncts, subsetsWithMaxOccurance[i]);
                // Keep the subset only if it actually removed something.
                if (disjuncts.Count() < disjunctsCountBeforeRemoval)
                    finalQueryList.Add(subsetsWithMaxOccurance[i]);
                if (disjuncts.Count() == 0)
                    return;
                else
                {
                    // Last candidate processed while disjuncts remain.
                    if (i >= (subsetsWithMaxOccurance.Count() - 1))
                    {
                        // find new possible subsets
                        QueryConverterUtils.ExtractUniqueQueryTokens(disjuncts, finalQueryList, twitterProfilerFrequencyThreshold);
                        return;
                    }
                }
            }
            else
            {
                // Frequency not below the threshold: only act once the last candidate has been reached.
                if (i >= (subsetsWithMaxOccurance.Count() - 1))
                {
                    // select subsets with the first lower occurance
                    ArrayList<int> lowerOccurances = subsetsGroupedOccurance.FindAll(x => x < maxOccurance);
                    if (lowerOccurances.Count > 0)
                        maxOccurance = lowerOccurances.Max();
                    else
                        maxOccurance = 0;
                    // No occurrence level of at least 1 is left to try.
                    if (maxOccurance < 1)
                        return;
                    // Retry with the next lower occurrence level.
                    QueryConverterUtils.ExtractUniqueQueryTokensBasedOnOccurance(disjuncts, finalQueryList, twitterProfilerFrequencyThreshold, subsetsGrouped, subsetsGroupedOccurance, maxOccurance);
                    if (disjuncts.Count() == 0)
                        return;
                }
            }
        }
    }
    else // if occurrence is 1 -> profiling not needed
    {
        foreach (var item in subsetsWithMaxOccurance)
        {
            int disjunctsCountBeforeRemoval = disjuncts.Count();
            // remove disjuncts from the original list that contain the selected subset
            QueryConverterUtils.RemoveDisjunctsContainingSubsets(disjuncts, item);
            // Keep the subset only if it actually removed something.
            if (disjuncts.Count() < disjunctsCountBeforeRemoval)
                finalQueryList.Add(item);
            if (disjuncts.Count() == 0)
                return;
        }
        // find new possible subsets
        QueryConverterUtils.ExtractUniqueQueryTokens(disjuncts, finalQueryList, twitterProfilerFrequencyThreshold);
        return;
    }
}
/// <summary>
/// Removes, in place, every disjunct for which <c>ContainsTokenSubset(subset, disjunct)</c> returns true.
/// </summary>
/// <param name="disjuncts">List of disjuncts to prune.</param>
/// <param name="subset">Token list passed as the first argument of each <c>ContainsTokenSubset</c> call.</param>
private static void RemoveDisjunctsContainingSubsets(ArrayList<ArrayList<QueryToken>> disjuncts, ArrayList<QueryToken> subset)
{
    // First pass: record the positions of all matching disjuncts, in ascending order.
    var matchingPositions = new ArrayList<int>();
    int position = 0;
    while (position < disjuncts.Count())
    {
        bool matches = ContainsTokenSubset(subset, disjuncts.ElementAt(position));
        if (matches)
        {
            matchingPositions.Add(position);
        }
        ++position;
    }

    // Second pass: the recorded positions are ascending and unique, so reading them backwards
    // removes from the end of the list first and keeps the earlier positions valid.
    for (int k = matchingPositions.Count() - 1; k >= 0; --k)
    {
        disjuncts.RemoveAt(matchingPositions[k]);
    }
}
/// <summary>
/// Replaces every disjunct with a new list holding the same tokens ordered by their
/// <c>ToString()</c> text, so that all disjuncts list their terms in the same order.
/// </summary>
/// <param name="disjuncts">List whose elements are replaced by their sorted copies.</param>
public static void SortDisjunctSubsets(ArrayList<ArrayList<QueryToken>> disjuncts)
{
    int index = 0;
    while (index < disjuncts.Count())
    {
        // Order the tokens by their string form, then store the result back in the same slot.
        var orderedTokens =
            from queryToken in disjuncts[index]
            orderby queryToken.ToString()
            select queryToken;

        disjuncts[index] = new ArrayList<QueryToken>(orderedTokens);
        ++index;
    }
}
/// <summary>
/// For every disjunct containing at least one token whose text includes ö, ü, ä, Ö, Ü, Ä or ß,
/// appends the result of <c>QueryConverterUtils.ConvertSingleDisjunctUmlauts</c> for that disjunct
/// to the end of <paramref name="disjuncts"/>.
/// </summary>
/// <remarks>
/// The original disjuncts stay in the list. Matching disjuncts are converted from the highest
/// index to the lowest, which determines the order of the appended entries.
/// </remarks>
/// <param name="disjuncts">List of disjuncts; extended in place.</param>
public static void ConvertUmlauts(ArrayList<ArrayList<QueryToken>> disjuncts)
{
    char[] umlautCharacters = { 'ö', 'ü', 'ä', 'Ö', 'Ü', 'Ä', 'ß' };

    // Positions are collected before anything is appended, so only the disjuncts that were
    // present on entry are considered.
    var positionsToConvert = new ArrayList<int>();
    for (int position = 0; position < disjuncts.Count(); ++position)
    {
        bool hasUmlaut = disjuncts[position]
            .Any(queryToken => queryToken.ToString().IndexOfAny(umlautCharacters) >= 0);
        if (hasUmlaut)
        {
            positionsToConvert.Add(position);
        }
    }

    // Positions were recorded ascending and once each; walk them backwards to convert the
    // highest position first.
    for (int k = positionsToConvert.Count() - 1; k >= 0; --k)
    {
        int source = positionsToConvert[k];
        disjuncts.AddRange(QueryConverterUtils.ConvertSingleDisjunctUmlauts(disjuncts[source]));
    }
}