/// <summary>
/// Reads a gzip-compressed link graph and writes each URL found in the first two columns of
/// every line to "url_" + input path (gzip-compressed, one URL per line). URLs are sorted
/// through a <c>DiskSorter</c> and consecutive duplicates in the sorted stream are dropped.
/// </summary>
/// <param name="args">Exactly one element: the path of the gzip-compressed link graph.</param>
static void Main(string[] args)
{
    if (args.Length != 1)
    {
        Console.Error.WriteLine("Usage: ConvertLinkGraphToTemporalSHSFormat <LinkGraph.gz>");
        return;
    }

    try
    {
        // FileMode.Create truncates an existing output file. OpenOrCreate would keep the old
        // bytes past the end of the new data whenever the new output is shorter.
        using (var rd = new StreamReader(new GZipStream(new FileStream(args[0], FileMode.Open, FileAccess.Read), CompressionMode.Decompress)))
        using (var wr = new StreamWriter(new GZipStream(new FileStream("url_" + args[0], FileMode.Create, FileAccess.Write), CompressionMode.Compress)))
        using (var sorter = new DiskSorter<URLData>(new URLData.Comparer(), URLData.Write, URLData.Read, 1 << 25))
        {
            char[] delimiter = { ' ', '\t' };
            string line;
            while ((line = rd.ReadLine()) != null)
            {
                string[] links = line.Split(delimiter, StringSplitOptions.RemoveEmptyEntries);
                if (links.Length < 2)
                {
                    // Blank or truncated line: skip it instead of aborting the whole run
                    // with an IndexOutOfRangeException.
                    continue;
                }

                sorter.Add(new URLData { url = links[0] });
                sorter.Add(new URLData { url = links[1] });
            }

            sorter.Sort();

            string lastURL = "";
            while (!sorter.AtEnd())
            {
                string currentURL = sorter.Get().url;

                // URLs are identifiers, so compare them ordinally; a culture-sensitive
                // comparison can treat distinct strings as equal and drop a URL.
                if (!string.Equals(currentURL, lastURL, StringComparison.Ordinal))
                {
                    wr.WriteLine(currentURL);
                    lastURL = currentURL;
                }
            }
        }
    }
    catch (Exception e)
    {
        Console.Error.WriteLine("Exception {0}", e.Message);
    }
    finally
    {
        Console.WriteLine("Over !!!");
    }
}
/// <summary>
/// Converts a node file (one URL per line) plus an index-based graph file into the binary
/// record stream written to <c>args[2]</c>: for every node, its normalized URL, an Int32
/// out-link count, and that many destination URLs. Afterwards the output is re-read as a
/// sanity check and page/link totals are reported on stderr.
/// </summary>
/// <param name="args">
/// <c>args[0]</c>: gzip nodes file; <c>args[1]</c>: gzip graph file whose first line is the
/// expected node count and whose remaining lines list the out-link indices of node 0, 1, ...;
/// <c>args[2]</c>: gzip output file.
/// </param>
public static void Main(string[] args)
{
    if (args.Length != 3)
    {
        Console.Error.WriteLine("Usage: ConvertCwGraphToShsInput <in-nodes.gz> <in-graph.gz> <out.bin.gz>");
        return;
    }

    var sw = System.Diagnostics.Stopwatch.StartNew();
    using (var lps = new DiskSorter<IdxIdx>(new IdxIdx.Comparer(), IdxIdx.Write, IdxIdx.Read, 1 << 25))
    {
        // Phase 1: read the graph file and feed every (source index, destination index)
        // pair to the first sorter.
        using (var rd = new StreamReader(new GZipStream(new FileStream(args[1], FileMode.Open, FileAccess.Read), CompressionMode.Decompress)))
        {
            var header = rd.ReadLine();
            if (header == null)
            {
                // Report an empty graph file explicitly instead of letting long.Parse(null)
                // fail with an ArgumentNullException.
                throw new Exception(string.Format("Unexpected end of {0}", args[1]));
            }

            var numExpected = long.Parse(header);
            if (numExpected == 4780950903)
            {
                numExpected += 8; // Hack: Deal with CW09 Cat A graph file being flawed
            }
            Console.WriteLine("Expecting {0} body lines", numExpected);

            long srcIdx = 0;
            long numLinks = 0;
            string line;
            while ((line = rd.ReadLine()) != null)
            {
                var outLinks = line.Split(Sep, StringSplitOptions.RemoveEmptyEntries);
                foreach (var token in outLinks)
                {
                    var link = long.Parse(token);
                    if (link >= numExpected)
                    {
                        throw new Exception(string.Format("Line {0} has out-of-range link {1}", srcIdx, link));
                    }
                    lps.Add(new IdxIdx { srcIdx = srcIdx, dstIdx = link });
                    numLinks++;
                }
                srcIdx++;
            }
            Console.Error.WriteLine("Read graph file, found {0} nodes and {1} edges.", srcIdx, numLinks);
        }

        lps.Sort();

        using (var lss = new DiskSorter<IdxUrl>(new IdxUrl.Comparer(), IdxUrl.Write, IdxUrl.Read, 1 << 20))
        {
            // Phase 2: walk the nodes file and the sorted pairs in lock-step and replace each
            // destination index by its URL. This only works if the pairs arrive in
            // non-decreasing dstIdx order (presumably what IdxIdx.Comparer provides - confirm).
            // NOTE(review): System.IO.BinaryReader has no ReadLine(); this relies on a
            // project-provided member/extension. Left unchanged - confirm, or switch to StreamReader.
            using (var nodeRd = new BinaryReader(new GZipStream(new FileStream(args[0], FileMode.Open, FileAccess.Read), CompressionMode.Decompress)))
            {
                long urlIdx = -1;
                string url = null;
                while (!lps.AtEnd())
                {
                    var pair = lps.Get();
                    while (urlIdx < pair.dstIdx)
                    {
                        var line = nodeRd.ReadLine();
                        if (line == null)
                        {
                            throw new Exception(string.Format("Unexpected end of {0}", args[0]));
                        }
                        url = NormalizeUrlSchemeAndHost(line);
                        urlIdx++;
                    }
                    lss.Add(new IdxUrl { srcIdx = pair.srcIdx, dstUrl = url });
                }
            }

            lss.Sort();

            // Phase 3: walk the nodes file and the re-sorted (srcIdx, dstUrl) pairs in
            // lock-step and emit one record per node. Nodes without out-links get a count of 0.
            using (var nodeRd = new BinaryReader(new GZipStream(new FileStream(args[0], FileMode.Open, FileAccess.Read), CompressionMode.Decompress)))
            // FileMode.Create truncates a pre-existing output; OpenOrCreate would leave stale
            // bytes after the new gzip stream when the old file was longer.
            using (var wr = new BinaryWriter(new GZipStream(new FileStream(args[2], FileMode.Create, FileAccess.Write), CompressionMode.Compress)))
            {
                long urlIdx = 0;   // index of the next node line to read
                long lstIdx = -1;  // source index whose links are collected in 'list'
                var list = new List<string>();

                while (!lss.AtEnd())
                {
                    var pair = lss.Get();
                    if (lstIdx == -1)
                    {
                        lstIdx = pair.srcIdx;
                    }

                    if (lstIdx == pair.srcIdx)
                    {
                        list.Add(pair.dstUrl);
                        continue;
                    }

                    // A new source starts: flush every node up to and including lstIdx.
                    while (urlIdx <= lstIdx)
                    {
                        var line = nodeRd.ReadLine();
                        if (line == null)
                        {
                            throw new Exception(string.Format("Unexpected end of {0}", args[0]));
                        }
                        WriteShsNodeRecord(wr, NormalizeUrlSchemeAndHost(line), urlIdx == lstIdx ? list : null);
                        urlIdx++;
                    }

                    list.Clear();
                    list.Add(pair.dstUrl);
                    lstIdx = pair.srcIdx;
                }

                // Flush the last collected source and all remaining nodes.
                string rest;
                while ((rest = nodeRd.ReadLine()) != null)
                {
                    WriteShsNodeRecord(wr, NormalizeUrlSchemeAndHost(rest), urlIdx == lstIdx ? list : null);
                    urlIdx++;
                }
            }
        }
    }

    Console.Error.WriteLine("Wrote out result file; starting sanity check.");
    using (var rd = new BinaryReader(new GZipStream(new FileStream(args[2], FileMode.Open, FileAccess.Read), CompressionMode.Decompress)))
    {
        long pageCnt = 0;
        long linkCnt = 0;
        for (; ; )
        {
            try
            {
                rd.ReadString(); // source URL
                pageCnt++;
                var numLinks = rd.ReadInt32();
                for (int i = 0; i < numLinks; i++)
                {
                    rd.ReadString(); // destination URL
                    linkCnt++;
                }
            }
            catch (EndOfStreamException)
            {
                // End of the record stream.
                break;
            }
        }
        Console.Error.WriteLine("Found {0} pages and {1} links. Job took {2} seconds.", pageCnt, linkCnt, 0.001 * sw.ElapsedMilliseconds);
    }
}

/// <summary>
/// Writes one node record: the source URL, an Int32 link count, then each destination URL.
/// </summary>
/// <param name="wr">Writer positioned at the end of the record stream.</param>
/// <param name="srcUrl">Normalized URL of the node.</param>
/// <param name="dstUrls">Destination URLs of the node, or null for a node without links.</param>
private static void WriteShsNodeRecord(BinaryWriter wr, string srcUrl, List<string> dstUrls)
{
    wr.Write(srcUrl);
    if (dstUrls == null)
    {
        wr.Write(0);
        return;
    }

    wr.Write(dstUrls.Count);
    foreach (var dstUrl in dstUrls)
    {
        wr.Write(dstUrl);
    }
}
/// <summary>
/// Converts a gzip-compressed link list (three whitespace-separated fields per line) into two
/// gzip-compressed adjacency-list files: <c>args[1]</c> groups field 1 by (field 0, field 2),
/// and <c>args[2]</c> groups field 0 by (field 1, field 2).
/// </summary>
/// <param name="args">
/// <c>args[0]</c>: input link graph; <c>args[1]</c>: forward adjacency output;
/// <c>args[2]</c>: backward adjacency output.
/// </param>
public static void Main(string[] args)
{
    if (args.Length != 3)
    {
        Console.Error.WriteLine("Usage: ConvertLinkGraphToTemporalAdjList <LinkGraph.gz> <out.gz> <in.gz>");
        return;
    }

    Console.WriteLine("Starting...");
    try
    {
        WriteTemporalAdjacencyFile(args[0], args[1], 0, 1);
        Console.WriteLine("Finished forward links ...");
        WriteTemporalAdjacencyFile(args[0], args[2], 1, 0);
    }
    catch (Exception e)
    {
        // Failures used to be swallowed silently, leaving truncated output with no diagnostic.
        Console.Error.WriteLine("Exception {0}", e);
    }
}

/// <summary>
/// Reads <paramref name="inputPath"/>, sorts the links through a <c>DiskSorter</c> keyed on
/// "node time neighbour", and writes one line per distinct (node, time) pair in the form
/// "node time neighbour1 neighbour2 ... " (every token followed by a single space).
/// </summary>
/// <param name="inputPath">Gzip-compressed input; field 2 of each line is used as the time.</param>
/// <param name="outputPath">Gzip-compressed output; an existing file is overwritten.</param>
/// <param name="nodeField">Index (0 or 1) of the field used as the grouping node.</param>
/// <param name="neighbourField">Index (0 or 1) of the field listed as the neighbour.</param>
private static void WriteTemporalAdjacencyFile(string inputPath, string outputPath, int nodeField, int neighbourField)
{
    // FileMode.Create truncates an existing output file; OpenOrCreate would not.
    using (var rd = new StreamReader(new GZipStream(new FileStream(inputPath, FileMode.Open, FileAccess.Read), CompressionMode.Decompress)))
    using (var wr = new StreamWriter(new GZipStream(new FileStream(outputPath, FileMode.Create, FileAccess.Write), CompressionMode.Compress)))
    using (var sorter = new DiskSorter<AdjDstURL>(new AdjDstURL.Comparer(), AdjDstURL.Write, AdjDstURL.Read, 1 << 25))
    {
        string line;
        while ((line = rd.ReadLine()) != null)
        {
            string[] links = line.Split(Sep, StringSplitOptions.RemoveEmptyEntries);
            if (links.Length < 3)
            {
                // Blank or truncated line: nothing usable to record.
                continue;
            }
            sorter.Add(new AdjDstURL { srcIdx = links[nodeField] + " " + links[2] + " " + links[neighbourField] });
        }

        sorter.Sort();

        // Consume every sorted record exactly once. The previous loop fetched a record ahead
        // and then stopped on AtEnd() before writing it, so the final link was lost (and an
        // empty input called Get() on an empty sorter).
        string currentKey = null;
        while (!sorter.AtEnd())
        {
            string[] parts = sorter.Get().srcIdx.Split(Sep, StringSplitOptions.RemoveEmptyEntries);
            string key = parts[0] + " " + parts[1];
            if (!string.Equals(key, currentKey, StringComparison.Ordinal))
            {
                if (currentKey != null)
                {
                    wr.WriteLine();
                }
                wr.Write(key + " ");
                currentKey = key;
            }
            wr.Write(parts[2] + " ");
        }

        if (currentKey != null)
        {
            wr.WriteLine();
        }
    }
}
/// <summary>
/// Converts a gzip-compressed temporal link list into forward and backward adjacency lists
/// keyed by uid. Each input line holds source URL, destination URL and a date prefix that
/// becomes a date once "-28" is appended. URLs are normalized with
/// <c>getNormalURLFormat</c> and resolved in batches through
/// <c>store.BatchedUrlToUid</c>; links with an endpoint that resolves to -1 are dropped.
/// </summary>
/// <param name="store">Store used to translate URLs to uids.</param>
/// <param name="args0">Path of the gzip-compressed input file.</param>
/// <param name="args1">
/// Path of the forward output; the backward output is written to "Bwd_" + <paramref name="args1"/>.
/// </param>
/// <returns>
/// Number of adjacency lists written to the forward file (key) and to the backward file (value).
/// Errors are logged to stderr and the counts reached so far are returned.
/// </returns>
public static KeyValuePair<long, long> ConvertSURT2AdjListByUid(Store store, string args0, string args1)
{
    char[] Sep = { ' ', '\t' };
    var d0 = Convert.ToDateTime("1998-01-01");
    const int NUM_CACHE = 100000;

    long num_links = 0;
    long fwd_num_list = 0;
    long bwd_num_list = 0;
    string line = "";
    string[] cache = new string[2 * NUM_CACHE]; // [2i] = source URL, [2i+1] = destination URL
    long[] temporal = new long[NUM_CACHE];      // days since d0 for cached link i
    int current_count = 0;                      // number of links currently cached

    // Writes "source<TAB>time<TAB>dest<TAB>dest..." lines, one per run of equal
    // (source_node, time), and returns the number of lines. Every record is consumed; the
    // previous look-ahead loop stopped on AtEnd() before writing the final record.
    Func<DiskSorter<TemporalNodeIdLinks>, StreamWriter, long> writeLists = (src, output) =>
    {
        long lists = 0;
        bool open = false;
        var group = default(TemporalNodeIdLinks);
        while (!src.AtEnd())
        {
            TemporalNodeIdLinks link = src.Get();
            if (!open || link.source_node != group.source_node || link.time != group.time)
            {
                if (open)
                {
                    output.WriteLine();
                }
                output.Write(link.source_node + "\t" + link.time + "\t");
                group = link;
                open = true;
                lists++;
            }
            output.Write(link.dest_node + "\t");
        }
        if (open)
        {
            output.WriteLine();
        }
        return lists;
    };

    // The backward sorter is now disposed like the forward one.
    using (var bwd_sorter = new DiskSorter<TemporalNodeIdLinks>(new TemporalNodeIdLinks.Comparer(), TemporalNodeIdLinks.Write, TemporalNodeIdLinks.Read, 1 << 25))
    {
        try
        {
            // FileMode.Create truncates existing outputs; OpenOrCreate would not.
            using (var rd = new StreamReader(new GZipStream(new FileStream(args0, FileMode.Open, FileAccess.Read), CompressionMode.Decompress)))
            using (var wr = new StreamWriter(new GZipStream(new FileStream(args1, FileMode.Create, FileAccess.Write), CompressionMode.Compress)))
            using (var sorter = new DiskSorter<TemporalNodeIdLinks>(new TemporalNodeIdLinks.Comparer(), TemporalNodeIdLinks.Write, TemporalNodeIdLinks.Read, 1 << 25))
            {
                // Resolves the first 'count' cached links and feeds both sorters.
                Action<int> flush = count =>
                {
                    string[] urls = cache;
                    if (count < NUM_CACHE)
                    {
                        // Partial batch: pass exactly 2 * count URLs. The old code copied
                        // count - 1 strings and processed only (count - 1) / 2 links.
                        urls = new string[2 * count];
                        Array.Copy(cache, urls, 2 * count);
                    }

                    long[] uids = store.BatchedUrlToUid(urls);
                    for (int i = 0; i < count; i++)
                    {
                        long srcUid = uids[2 * i];
                        long dstUid = uids[2 * i + 1];
                        if (srcUid != -1 && dstUid != -1)
                        {
                            sorter.Add(new TemporalNodeIdLinks { source_node = srcUid, time = temporal[i], dest_node = dstUid });
                            bwd_sorter.Add(new TemporalNodeIdLinks { source_node = dstUid, time = temporal[i], dest_node = srcUid });
                        }
                    }
                };

                while ((line = rd.ReadLine()) != null)
                {
                    string[] links = line.Split(Sep, StringSplitOptions.RemoveEmptyEntries);
                    if (links.Length < 3)
                    {
                        // Blank or truncated line: skip rather than abort the conversion.
                        continue;
                    }

                    temporal[current_count] = (Convert.ToDateTime(links[2] + "-28") - d0).Days;
                    cache[2 * current_count] = getNormalURLFormat(links[0]);
                    cache[2 * current_count + 1] = getNormalURLFormat(links[1]);
                    current_count++;

                    if (current_count == NUM_CACHE)
                    {
                        flush(NUM_CACHE);
                        current_count = 0;
                        Console.WriteLine("Next 1mil links finished...");
                    }
                }

                if (current_count > 0)
                {
                    flush(current_count);
                }

                sorter.Sort();
                num_links = sorter.Total;
                Console.WriteLine("{0} links", num_links);
                wr.WriteLine(num_links);
                fwd_num_list = writeLists(sorter, wr);
                Console.WriteLine("Finished build links to adjacency list by string...");
            }

            using (var wr = new StreamWriter(new GZipStream(new FileStream("Bwd_" + args1, FileMode.Create, FileAccess.Write), CompressionMode.Compress)))
            {
                bwd_sorter.Sort();
                Console.WriteLine("{0} links", bwd_sorter.Total);
                wr.WriteLine(num_links);
                bwd_num_list = writeLists(bwd_sorter, wr);
            }
        }
        catch (Exception e)
        {
            // Best effort: report the failure and the last line read, then return the counts.
            Console.Error.WriteLine(e.Message);
            Console.Error.WriteLine(e.Source);
            Console.Error.WriteLine(e.StackTrace);
            Console.Error.WriteLine(line);
        }
    }

    return new KeyValuePair<long, long>(fwd_num_list, bwd_num_list);
}
/// <summary>
/// Converts a gzip-compressed temporal link list into forward and backward adjacency lists
/// keyed by uid, using a URL-to-uid mapping built up front by <c>buildMapping</c>.
/// For every input line, one link per date field in positions 3 .. (field count - 2) is emitted.
/// </summary>
/// <param name="store">Store handed to <c>buildMapping</c>.</param>
/// <param name="args0">Path of the gzip-compressed input file.</param>
/// <param name="args1">
/// Path of the forward output; the backward output is written to "Bwd_" + <paramref name="args1"/>.
/// </param>
/// <returns>
/// Number of adjacency lists written to the forward file (key) and to the backward file (value).
/// Errors are logged to stderr and the counts reached so far are returned.
/// </returns>
public static KeyValuePair<long, long> AttempConvertTempLinks2AdjListByUid(Store store, string args0, string args1)
{
    char[] Sep = { ' ', '\t' };
    var d0 = Convert.ToDateTime("1998-01-01");

    long num_links = 0;
    long fwd_num_list = 0;
    long bwd_num_list = 0;
    string line = "";
    SortedDictionary<string, long> mapped = buildMapping(store, args0);

    // Writes "source<TAB>time<TAB>dest<TAB>dest..." lines, one per run of equal
    // (source_node, time), and returns the number of lines. Every record is consumed; the
    // previous look-ahead loop stopped on AtEnd() before writing the final record.
    Func<DiskSorter<TemporalNodeIdLinks>, StreamWriter, long> writeLists = (src, output) =>
    {
        long lists = 0;
        bool open = false;
        var group = default(TemporalNodeIdLinks);
        while (!src.AtEnd())
        {
            TemporalNodeIdLinks link = src.Get();
            if (!open || link.source_node != group.source_node || link.time != group.time)
            {
                if (open)
                {
                    output.WriteLine();
                }
                output.Write(link.source_node + "\t" + link.time + "\t");
                group = link;
                open = true;
                lists++;
            }
            output.Write(link.dest_node + "\t");
        }
        if (open)
        {
            output.WriteLine();
        }
        return lists;
    };

    // The backward sorter is now disposed like the forward one.
    using (var bwd_sorter = new DiskSorter<TemporalNodeIdLinks>(new TemporalNodeIdLinks.Comparer(), TemporalNodeIdLinks.Write, TemporalNodeIdLinks.Read, 1 << 25))
    {
        try
        {
            // FileMode.Create truncates existing outputs; OpenOrCreate would not.
            using (var rd = new StreamReader(new GZipStream(new FileStream(args0, FileMode.Open, FileAccess.Read), CompressionMode.Decompress)))
            using (var wr = new StreamWriter(new GZipStream(new FileStream(args1, FileMode.Create, FileAccess.Write), CompressionMode.Compress)))
            using (var sorter = new DiskSorter<TemporalNodeIdLinks>(new TemporalNodeIdLinks.Comparer(), TemporalNodeIdLinks.Write, TemporalNodeIdLinks.Read, 1 << 25))
            {
                while ((line = rd.ReadLine()) != null)
                {
                    string[] links = line.Split(Sep, StringSplitOptions.RemoveEmptyEntries);
                    if (links.Length < 5)
                    {
                        // The date loop below covers fields 3 .. Length - 2, so shorter
                        // lines contribute no links.
                        continue;
                    }

                    // The destination uid was previously looked up with links[0], which
                    // turned every edge into a self-loop.
                    long src;
                    long dest;
                    if (!mapped.TryGetValue(links[0], out src) ||
                        !mapped.TryGetValue(links[1], out dest) ||
                        src == -1 || dest == -1)
                    {
                        // Unknown endpoint: skip instead of recording uid 0.
                        continue;
                    }

                    // NOTE(review): the sibling converters also treat links[2] as a date;
                    // starting at 3 is kept as found - confirm whether links[2] belongs here.
                    for (int t = 3; t < links.Length - 1; t++)
                    {
                        long temporal = (Convert.ToDateTime(links[t]) - d0).Days;
                        sorter.Add(new TemporalNodeIdLinks { source_node = src, time = temporal, dest_node = dest });
                        bwd_sorter.Add(new TemporalNodeIdLinks { source_node = dest, time = temporal, dest_node = src });
                    }
                }

                sorter.Sort();
                num_links = sorter.Total;
                Console.WriteLine("{0} links", num_links);
                wr.WriteLine(num_links);
                fwd_num_list = writeLists(sorter, wr);
                Console.WriteLine("Finished build links to adjacency list by string...");
            }

            using (var wr = new StreamWriter(new GZipStream(new FileStream("Bwd_" + args1, FileMode.Create, FileAccess.Write), CompressionMode.Compress)))
            {
                bwd_sorter.Sort();
                Console.WriteLine("{0} links", bwd_sorter.Total);
                wr.WriteLine(num_links);
                bwd_num_list = writeLists(bwd_sorter, wr);
            }
        }
        catch (Exception e)
        {
            // Best effort: report the failure and the last line read, then return the counts.
            Console.Error.WriteLine(e.Message);
            Console.Error.WriteLine(e.Source);
            Console.Error.WriteLine(e.StackTrace);
            Console.Error.WriteLine(line);
        }
    }

    return new KeyValuePair<long, long>(fwd_num_list, bwd_num_list);
}
/// <summary>
/// Converts a gzip-compressed temporal link list into forward and backward adjacency lists
/// keyed by uid. Each input line holds source URL, destination URL, a date in field 2 and
/// further dates in fields 3 .. (field count - 2); one link is emitted per date. URLs are
/// resolved in batches through <c>store.BatchedUrlToUid</c>, and links with an endpoint that
/// resolves to -1 (or not at all) are dropped.
/// </summary>
/// <param name="store">Store used to translate URLs to uids.</param>
/// <param name="args0">Path of the gzip-compressed input file.</param>
/// <param name="args1">
/// Path of the forward output; the backward output is written to "Bwd_" + <paramref name="args1"/>.
/// </param>
/// <returns>
/// Number of adjacency lists written to the forward file (key) and to the backward file (value).
/// Errors are logged to stderr and the counts reached so far are returned.
/// </returns>
public static KeyValuePair<long, long> ConvertTempLinks2AdjListByUid(Store store, string args0, string args1)
{
    char[] Sep = { ' ', '\t' };
    var d0 = Convert.ToDateTime("1998-01-01");
    const int NUM_CACHE = 10000000;

    long num_links = 0;
    long fwd_num_list = 0;
    long bwd_num_list = 0;
    string line = "";
    string[] cache = new string[2 * NUM_CACHE];   // [2i] = source URL, [2i+1] = destination URL
    long[][] temporal = new long[NUM_CACHE][];    // days since d0 for every date of cached link i
    int current_count = 0;                        // number of links currently cached
    SortedSet<string> url_set = new SortedSet<string>();

    // Writes "source<TAB>time<TAB>dest<TAB>dest..." lines, one per run of equal
    // (source_node, time), and returns the number of lines. Every record is consumed; the
    // previous look-ahead loop stopped on AtEnd() before writing the final record.
    Func<DiskSorter<TemporalNodeIdLinks>, StreamWriter, long> writeLists = (src, output) =>
    {
        long lists = 0;
        bool open = false;
        var group = default(TemporalNodeIdLinks);
        while (!src.AtEnd())
        {
            TemporalNodeIdLinks link = src.Get();
            if (!open || link.source_node != group.source_node || link.time != group.time)
            {
                if (open)
                {
                    output.WriteLine();
                }
                output.Write(link.source_node + "\t" + link.time + "\t");
                group = link;
                open = true;
                lists++;
            }
            output.Write(link.dest_node + "\t");
        }
        if (open)
        {
            output.WriteLine();
        }
        return lists;
    };

    // The backward sorter is now disposed like the forward one.
    using (var bwd_sorter = new DiskSorter<TemporalNodeIdLinks>(new TemporalNodeIdLinks.Comparer(), TemporalNodeIdLinks.Write, TemporalNodeIdLinks.Read, 1 << 25))
    {
        try
        {
            // FileMode.Create truncates existing outputs; OpenOrCreate would not.
            using (var rd = new StreamReader(new GZipStream(new FileStream(args0, FileMode.Open, FileAccess.Read), CompressionMode.Decompress)))
            using (var wr = new StreamWriter(new GZipStream(new FileStream(args1, FileMode.Create, FileAccess.Write), CompressionMode.Compress)))
            using (var sorter = new DiskSorter<TemporalNodeIdLinks>(new TemporalNodeIdLinks.Comparer(), TemporalNodeIdLinks.Write, TemporalNodeIdLinks.Read, 1 << 25))
            {
                // Resolves the URLs of the first 'count' cached links and feeds both sorters.
                Action<int> flush = count =>
                {
                    string[] unique_urls = new string[url_set.Count];
                    url_set.CopyTo(unique_urls);
                    long[] uids = store.BatchedUrlToUid(unique_urls);

                    if (uids.Length != unique_urls.Length)
                    {
                        Console.WriteLine("Error mapping...");
                    }

                    var mapped = new Dictionary<string, long>(unique_urls.Length);
                    int usable = Math.Min(unique_urls.Length, uids.Length);
                    for (int i = 0; i < usable; i++)
                    {
                        mapped[unique_urls[i]] = uids[i];
                    }

                    for (int i = 0; i < count; i++)
                    {
                        long mSource;
                        long mDest;
                        if (!mapped.TryGetValue(cache[2 * i], out mSource) ||
                            !mapped.TryGetValue(cache[2 * i + 1], out mDest) ||
                            mSource == -1 || mDest == -1)
                        {
                            continue;
                        }

                        foreach (long time in temporal[i])
                        {
                            sorter.Add(new TemporalNodeIdLinks { source_node = mSource, time = time, dest_node = mDest });
                            bwd_sorter.Add(new TemporalNodeIdLinks { source_node = mDest, time = time, dest_node = mSource });
                        }
                    }

                    url_set.Clear();
                    Console.WriteLine("Next {0} links finished...", count);
                };

                while ((line = rd.ReadLine()) != null)
                {
                    string[] links = line.Split(Sep, StringSplitOptions.RemoveEmptyEntries);
                    if (links.Length < 3)
                    {
                        // Blank or truncated line: skip rather than abort the conversion.
                        continue;
                    }

                    // One slot for links[2] plus one per field in 3 .. Length - 2. The array
                    // used to be sized by line.Length (characters), which left trailing zeros
                    // that were emitted as bogus day-0 links.
                    long[] times = new long[Math.Max(1, links.Length - 3)];
                    times[0] = (Convert.ToDateTime(links[2]) - d0).Days;
                    for (int t = 3; t < links.Length - 1; t++)
                    {
                        times[t - 2] = (Convert.ToDateTime(links[t]) - d0).Days;
                    }

                    temporal[current_count] = times;
                    cache[2 * current_count] = links[0];
                    cache[2 * current_count + 1] = links[1];
                    url_set.Add(links[0]);
                    url_set.Add(links[1]);
                    current_count++;

                    if (current_count == NUM_CACHE)
                    {
                        flush(NUM_CACHE);
                        current_count = 0;
                    }
                }

                if (current_count > 0)
                {
                    // Process all remaining links; the old code decremented the count first
                    // and so dropped the last cached link.
                    flush(current_count);
                }

                sorter.Sort();
                num_links = sorter.Total;
                Console.WriteLine("{0} links", num_links);
                wr.WriteLine(num_links);
                fwd_num_list = writeLists(sorter, wr);
                Console.WriteLine("Finished build links to adjacency list by string...");
            }

            using (var wr = new StreamWriter(new GZipStream(new FileStream("Bwd_" + args1, FileMode.Create, FileAccess.Write), CompressionMode.Compress)))
            {
                bwd_sorter.Sort();
                Console.WriteLine("{0} links", bwd_sorter.Total);
                wr.WriteLine(num_links);
                bwd_num_list = writeLists(bwd_sorter, wr);
            }
        }
        catch (Exception e)
        {
            // Best effort: report the failure and the last line read, then return the counts.
            Console.Error.WriteLine(e.Message);
            Console.Error.WriteLine(e.Source);
            Console.Error.WriteLine(e.StackTrace);
            Console.Error.WriteLine(line);
        }
    }

    return new KeyValuePair<long, long>(fwd_num_list, bwd_num_list);
}