/// <summary>
/// Reads the swf (header and tags); this is the only public method of
/// <see cref="SwfDotNet.IO.SwfReader">SwfReader</see> besides the
/// <see cref="SwfDotNet.IO.SwfReader.Close">Close</see> and
/// <see cref="SwfDotNet.IO.SwfReader.ReadSwfHeader">ReadSwfHeader</see> methods.
/// The returned <see cref="SwfDotNet.IO.Swf">Swf</see> object contains the swf header
/// information and the tag list.
/// </summary>
public Swf ReadSwf()
{
    // compressed swf?
    if (br.PeekChar() == 'C')
    {
        Inflate();
    }

    SwfHeader header = new SwfHeader();
    header.ReadData(br);
    this.version = header.Version;

    tagList = new BaseTagCollection();

    // necessary because of the "1 more byte" bug
    bool readEndTag = false;
    while (br.BaseStream.Position < br.BaseStream.Length && !readEndTag)
    {
        BaseTag b = SwfReader.ReadTag(this.version, this.br, this.tagList);
        if (b != null)
        {
            if (b is EndTag)
            {
                readEndTag = true;
            }
            tagList.Add(b);
        }
    }

    br.Close();
    return new Swf(header, tagList);
}
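// A minimal usage sketch for ReadSwf above; the Stream-based SwfReader constructor is an
// assumption suggested by the br field in the listing, not something the listing itself shows.
using System;
using System.IO;
using SwfDotNet.IO;

class ReadSwfExample
{
    static void Main()
    {
        using (FileStream stream = File.OpenRead("movie.swf"))
        {
            SwfReader reader = new SwfReader(stream);
            Swf swf = reader.ReadSwf();   // reads the header and every tag up to the EndTag,
                                          // then closes the underlying reader
            Console.WriteLine("swf read: " + swf);
        }
    }
}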
public static List<Tuple<int, BinarySearchSet<int>>> GetMatrix(String filename)
{
    var watch = Stopwatch.StartNew();
    var matrix = new List<Tuple<int, BinarySearchSet<int>>>();

    using (var reader = new BufferedBinaryReader(filename))
    {
        int lineCounter = 0;
        float length = reader.reader.BaseStream.Length / 100.0f;

        while (reader.PeekChar() > -1)
        {
            if (++lineCounter % 100000 == 0)
            {
                Console.Write(" \rRead: {0} %\r", (reader.reader.BaseStream.Position / length).ToString("0.000"));
            }

            int userId = reader.ReadInt32();
            int nTerms = reader.ReadInt32();

            var list = new List<int>();
            for (int i = 0; i < nTerms; i++)
            {
                list.Add(reader.ReadInt32());
            }

            matrix.Add(new Tuple<int, BinarySearchSet<int>>(userId, new BinarySearchSet<int>(list, Comparer<int>.Default)));
        }
    }

    watch.Stop();
    Console.WriteLine("Matrix loaded after: {0}", watch.Elapsed);
    return matrix;
}
private void ReadData()
{
    var path = PathResolver.UserMatrixOutputProcessed;
    _list = new List<Tuple<int, List<int>>>();

    using (var reader = new BufferedBinaryReader(path))
    {
        while (reader.PeekChar() > -1)
        {
            int currentUser = reader.ReadInt32();
            var usersList = new List<int>();
            for (int i = reader.ReadInt32(); i > 0; i--)
            {
                int userId = reader.ReadInt32();
                usersList.Add(userId);
            }
            _list.Add(new Tuple<int, List<int>>(currentUser, usersList));
        }
    }
}
private static void getTopUrls(string input, string output)
{
    List<Tuple<int, int>> allUrls = new List<Tuple<int, int>>();
    Dictionary<int, int> urlsCount = new Dictionary<int, int>();

    Stopwatch watch = new Stopwatch();
    watch.Start();

    using (BufferedBinaryReader reader = new BufferedBinaryReader(input))
    {
        HashSet<int> processedQueries = new HashSet<int>();
        while (reader.PeekChar() > -1)
        {
            byte type = reader.ReadByte();
            int session = reader.ReadInt32();
            switch (type)
            {
                case 0:
                    // DAY
                    reader.ReadInt32();
                    int user = reader.ReadInt32();
                    break;
                case 1:
                case 2:
                {
                    // TIME
                    reader.ReadInt32();
                    // SERPID
                    reader.ReadInt32();
                    // QUERYID
                    int query_id = reader.ReadInt32();
                    bool process = !processedQueries.Contains(query_id);
                    if (process)
                    {
                        processedQueries.Add(query_id);
                    }
                    // TERMS
                    int termsN = reader.ReadInt32();
                    for (int i = termsN; i > 0; i--)
                    {
                        int term = reader.ReadInt32();
                    }
                    // URLS & DOMAINS
                    for (int i = reader.ReadInt32(); i > 0; i--)
                    {
                        int url = reader.ReadInt32();
                        int domain = reader.ReadInt32();
                        if (!process)
                        {
                            continue;
                        }
                        if (urlsCount.ContainsKey(url))
                        {
                            urlsCount[url]++;
                        }
                        else
                        {
                            urlsCount.Add(url, 1);
                        }
                    }
                    break;
                }
                case 3:
                {
                    // TIME
                    reader.ReadInt32();
                    // SERPID
                    reader.ReadInt32();
                    // URLS
                    int url = reader.ReadInt32();
                    break;
                }
            }
        }
    }

    // move the counts from the dictionary into the list
    foreach (var element in urlsCount)
    {
        allUrls.Add(new Tuple<int, int>(element.Key, element.Value));
    }
    urlsCount.Clear();
    urlsCount = new Dictionary<int, int>();
    GC.Collect();
    watch.Stop();
    Console.WriteLine("... finished after {0} ({1})", watch.ElapsedMilliseconds, allUrls.Count);
    watch.Restart();

    // sort by descending occurrence count
    allUrls.Sort((o1, o2) => { return o2.Item2 - o1.Item2; });
    watch.Stop();
    Console.WriteLine("Finished sorting the data after " + watch.ElapsedMilliseconds);
    watch.Restart();

    // keep only the top 1000 urls
    if (allUrls.Count > 1000)
    {
        allUrls.RemoveRange(1000, allUrls.Count - 1000);
    }
    watch.Stop();
    Console.WriteLine("Trimmed the list after " + watch.ElapsedMilliseconds);
    watch.Restart();

    using (StreamWriter writer = new StreamWriter(output))
    {
        foreach (var element in allUrls)
        {
            writer.WriteLine(element.Item1 + "\t" + element.Item2);
        }
    }
    watch.Stop();
    Console.WriteLine("Finished writing the data after " + watch.ElapsedMilliseconds);
}
private static void getTermsFromTop(string logFilename, string topFilename, StreamWriter output)
{
    const int N = 100;
    List<Tuple<int, int>> terms = new List<Tuple<int, int>>();
    List<int>[] queries = new List<int>[N];
    HashSet<int> processedQueries = new HashSet<int>();

    for (int i = 0; i < N; i++)
    {
        queries[i] = new List<int>();
    }

    using (StreamReader reader = new StreamReader(topFilename))
    {
        string[] seps = new string[] { "\t" };
        for (int i = 0; i < N; i++)
        {
            string line = reader.ReadLine();
            string[] array = line.Split(seps, StringSplitOptions.None);
            terms.Add(new Tuple<int, int>(Int32.Parse(array[0]), Int32.Parse(array[1])));
        }
    }

    List<int> sortedTerms = new List<int>();
    foreach (var term in terms)
    {
        sortedTerms.Add(term.Item1);
    }
    sortedTerms.Sort();

    using (BufferedBinaryReader reader = new BufferedBinaryReader(logFilename))
    {
        while (reader.PeekChar() > -1)
        {
            byte type = reader.ReadByte();
            int session = reader.ReadInt32();
            switch (type)
            {
                case 0:
                    // DAY
                    reader.ReadInt32();
                    // USER
                    reader.ReadInt32();
                    break;
                case 1:
                case 2:
                {
                    // TIME
                    reader.ReadInt32();
                    // SERPID
                    reader.ReadInt32();
                    // QUERYID
                    int query_id = reader.ReadInt32();
                    bool process = !processedQueries.Contains(query_id);
                    if (process)
                    {
                        processedQueries.Add(query_id);
                    }
                    // TERMS
                    int termsN = reader.ReadInt32();
                    for (int i = termsN; i > 0; i--)
                    {
                        int term = reader.ReadInt32();
                        if (!process)
                        {
                            continue;
                        }
                        for (int index = 0; index < sortedTerms.Count; index++)
                        {
                            if (sortedTerms[index] > term)
                            {
                                break;
                            }
                            if (sortedTerms[index] == term)
                            {
                                queries[index].Add(query_id);
                            }
                        }
                    }
                    // URLS & DOMAINS
                    int n = reader.ReadInt32();
                    for (int i = n; i > 0; i--)
                    {
                        int url = reader.ReadInt32();
                        int domain = reader.ReadInt32();
                    }
                    break;
                }
                case 3:
                {
                    // TIME
                    reader.ReadInt32();
                    // SERPID
                    reader.ReadInt32();
                    // URLS
                    int url = reader.ReadInt32();
                    break;
                }
            }
        }
    }

    output.WriteLine("Top " + terms.Count + " terms:");
    for (int i = 0; i < terms.Count; i++)
    {
        output.WriteLine(terms[i].Item1 + "\t" + terms[i].Item2);
    }
    for (int i = 0; i < terms.Count; i++)
    {
        output.WriteLine();
        int index = sortedTerms.IndexOf(terms[i].Item1);
        queries[index].Sort();
        output.WriteLine(sortedTerms[index] + ":");
        foreach (var query in queries[index])
        {
            output.WriteLine(query);
        }
    }
    output.WriteLine();
}
public void CompareUsers()
{
    double simSum = 0;
    int simCount = 0;

    //List<Tuple<UserId, List<Tuple<UserId, Similarity>>>>
    //matrix = new List<Tuple<int, List<Tuple<int, int>>>>();
    var count = users.Count;
    var path = PathResolver.UserMatrixOutput;
    var path2 = PathResolver.UserMatrixOutputProcessed;

    using (var writer = new BinaryWriter(new FileStream(path, FileMode.CreateNew)))
    {
        for (int i = 0; i < count; i++)
        {
            var list = new List<Tuple<int, int>>();
            for (int j = i + 1; j < count; j++)
            {
                var res = CompareTwoUsers(users[i], users[j]);
                if (res == 0)
                {
                    continue;
                }
                list.Add(new Tuple<int, int>(j, res));
                simSum += res;
                simCount++;
            }

            // write UserId (the index of the current user)
            writer.Write(i);
            writer.Write(list.Count);
            foreach (var element in list)
            {
                writer.Write(element.Item1);
                writer.Write(element.Item2);
            }
            //matrix.Add(new Tuple<int, List<Tuple<int, int>>>(i,list));
        }
    }

    double minVal = 1.5 /* ?? */ * simSum / simCount;

    using (var writer = new BinaryWriter(new FileStream(path2, FileMode.CreateNew)))
    using (var reader = new BufferedBinaryReader(path))
    {
        while (reader.PeekChar() > -1)
        {
            int currentUser = reader.ReadInt32();
            var usersList = new List<int>();
            for (int i = reader.ReadInt32(); i > 0; i--)
            {
                int userId = reader.ReadInt32();
                int sim = reader.ReadInt32();
                if (sim > minVal)
                {
                    usersList.Add(userId);
                }
            }
            if (usersList.Count > 0)
            {
                writer.Write(currentUser);
                writer.Write(usersList.Count);
                foreach (var userId in usersList)
                {
                    writer.Write(userId);
                }
            }
        }
    }
}
public void Read()
{
    var binaryReader = new BufferedBinaryReader(_binaryReader);

    Metadata metadata = new Metadata();
    QueryAction queryAction = new QueryAction();
    Click click = new Click();

    float length = binaryReader.reader.BaseStream.Length / 100.0f;
    int lineCounter = 0;

    _reader.onBeginRead();

    int type = binaryReader.PeekChar();
    while (type > -1)
    {
        if (++lineCounter % 100000 == 0)
        {
            Console.Write(" \rRead: {0} %\r", (binaryReader.reader.BaseStream.Position / length).ToString("0.000"));
            if (lineCounter % 2000000 == 0)
            {
                GC.Collect();
            }
        }

        switch (type)
        {
            case 0:
            {
                metadata.type = binaryReader.ReadByte();
                metadata.sessionId = binaryReader.ReadInt32();
                metadata.day = binaryReader.ReadInt32();
                metadata.userId = binaryReader.ReadInt32();
                _reader.onMetadata(metadata);
                break;
            }
            case 1:
            case 2:
            {
                queryAction.type = binaryReader.ReadByte();
                queryAction.sessionId = binaryReader.ReadInt32();
                queryAction.time = binaryReader.ReadInt32();
                queryAction.serpid = binaryReader.ReadInt32();
                queryAction.queryId = binaryReader.ReadInt32();

                int nTerms = binaryReader.ReadInt32();
                queryAction.nTerms = nTerms;
                if (queryAction.terms == null || queryAction.terms.Length < nTerms)
                {
                    queryAction.terms = new int[nTerms];
                }
                for (int i = 0; i < nTerms; i++)
                {
                    queryAction.terms[i] = binaryReader.ReadInt32();
                }

                int nUrls = binaryReader.ReadInt32();
                queryAction.nUrls = nUrls;
                if (queryAction.urls == null || queryAction.urls.Length < nUrls)
                {
                    queryAction.urls = new int[nUrls];
                    queryAction.domains = new int[nUrls];
                }
                for (int i = 0; i < nUrls; i++)
                {
                    queryAction.urls[i] = binaryReader.ReadInt32();
                    queryAction.domains[i] = binaryReader.ReadInt32();
                }

                _reader.onQueryAction(queryAction);
                break;
            }
            case 3:
            {
                click.type = binaryReader.ReadByte();
                click.sessionId = binaryReader.ReadInt32();
                click.time = binaryReader.ReadInt32();
                click.serpid = binaryReader.ReadInt32();
                click.urlId = binaryReader.ReadInt32();
                _reader.onClick(click);
                break;
            }
        }

        type = binaryReader.PeekChar();
    }

    Console.Write(" \r");
    _reader.onEndRead();
}
/// <summary>
/// Imports the data into the database.
/// </summary>
private bool importData()
{
    tic("Importing data:", true, true);
    const int FLUSH_ROWS = 200000;

    String[] toImport = new String[] { sessionTableName, queryTableName, queryTermTableName, queryUrlTableName, clickTableName, urlTableName };
    foreach (String tableName in toImport)
    {
        tic(tableName, false, true);
        int[] types = getTableTypes(tableName);

        NpgsqlCommand cmd = new NpgsqlCommand(buildInsertCommand(tableName), connection);
        NpgsqlCopySerializer serializer = new NpgsqlCopySerializer(connection);
        NpgsqlCopyIn copyIn = new NpgsqlCopyIn(cmd, connection, serializer.ToStream);
        copyIn.Start();

        using (BufferedBinaryReader reader = new BufferedBinaryReader(workDir + tableName))
        {
            int lineCounter = 0;
            while (reader.PeekChar() > -1)
            {
                lineCounter++;
                for (int i = 0; i < types.Length; i++)
                {
                    if (types[i] == 0)
                    {
                        int value = reader.ReadInt32();
                        serializer.AddInt32(value);
                    }
                    if (types[i] == 1)
                    {
                        bool value = reader.ReadBool();
                        serializer.AddBool(value);
                    }
                }
                serializer.EndRow();
                if ((lineCounter + 1) % FLUSH_ROWS == 0)
                {
                    serializer.Flush();
                }
            }
            Console.Write(String.Format("{0,-15}", String.Format("({0})", lineCounter)));
        }

        serializer.Flush();
        serializer.Close();
        copyIn.End();
        toc();
    }

    toc(true);
    return true;
}
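// The import above relies on two helpers not shown in this listing: buildInsertCommand and
// getTableTypes. Below is a hypothetical sketch of the first one, assuming the Npgsql 2.x
// bulk-copy API used above (NpgsqlCopyIn expects a COPY ... FROM STDIN command);
// getTableTypes presumably returns one 0 (Int32 column) or 1 (Boolean column) marker per
// column, matching the reader/serializer loop.
private string buildInsertCommand(string tableName)
{
    // NpgsqlCopyIn streams the serialized rows into the table named by the COPY command.
    return "COPY " + tableName + " FROM STDIN";
}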
/// <summary>
/// Rewrites the data into the files corresponding to the individual tables.
/// </summary>
/// <param name="filename">The input data file.</param>
private bool rewriteData(String filename)
{
    tic("Rewriting data:");
    bool result = true;

    BinaryWriter session = null;
    BinaryWriter query = null;
    BinaryWriter queryTerm = null;
    BinaryWriter queryUrl = null;
    BinaryWriter click = null;
    BinaryWriter urlFile = null;

    session = new BinaryWriter(new FileStream(workDir + sessionTableName, FileMode.CreateNew));
    query = new BinaryWriter(new FileStream(workDir + queryTableName, FileMode.CreateNew));
    queryTerm = new BinaryWriter(new FileStream(workDir + queryTermTableName, FileMode.CreateNew));
    queryUrl = new BinaryWriter(new FileStream(workDir + queryUrlTableName, FileMode.CreateNew));
    click = new BinaryWriter(new FileStream(workDir + clickTableName, FileMode.CreateNew));
    urlFile = new BinaryWriter(new FileStream(workDir + urlTableName, FileMode.CreateNew));

    BinaryWriter[] writers = new BinaryWriter[] { session, query, queryTerm, queryUrl, click, urlFile };
    {
        bool any = false;
        foreach (var writer in writers)
        {
            any = any || (writer != null);
        }
        if (!any)
        {
            toc();
            return true;
        }
    }

    using (BufferedBinaryReader reader = new BufferedBinaryReader(filename))
    {
        int lineCounter = 0;
        int q_idCount = 0;
        int queryUrl_id = 0;

        // maps the (url, serpid) pair to (result_id, q_id)
        Dictionary<Tuple<int, int>, int> urlInfo = new Dictionary<Tuple<int, int>, int>();
        int lastTime = 0;
        HashSet<int> urlsAdded = new HashSet<int>();
        HashSet<int> queriesAdded = new HashSet<int>();

        while (reader.PeekChar() > -1)
        {
            lineCounter++;
            if (LINES_LIMIT > 0 && LINES_LIMIT < lineCounter)
            {
                break;
            }

            byte type = reader.ReadByte();
            if (type < 0 || type > 3)
            {
                Console.WriteLine("Incorrect file in line " + lineCounter);
                result = false;
                break;
            }

            int sessionId = reader.ReadInt32();
            switch (type)
            {
                case 0:
                {
                    int day = reader.ReadInt32();
                    int user = reader.ReadInt32();

                    session.Write((int)sessionId);
                    session.Write((int)day);
                    session.Write((int)user);

                    urlInfo.Clear();
                    lastTime = 0;
                    break;
                }
                case 1:
                case 2:
                {
                    // TIME
                    int time = reader.ReadInt32();
                    // SERPID
                    int serpid = reader.ReadInt32();
                    // QUERYID
                    int queryId = reader.ReadInt32();

                    int q_id = q_idCount++;
                    query.Write((int)q_id);
                    query.Write((int)queryId);
                    query.Write((int)sessionId);
                    query.Write((int)serpid);
                    query.Write((int)(time - lastTime));
                    lastTime = time;
                    query.Write((bool)(type == 2));

                    bool processQuery = !queriesAdded.Contains(queryId);
                    if (processQuery)
                    {
                        queriesAdded.Add(queryId);
                    }

                    for (int i = reader.ReadInt32(); i > 0; i--)
                    {
                        int term = reader.ReadInt32();
                        if (processQuery)
                        {
                            queryTerm.Write((int)term);
                            queryTerm.Write((int)queryId);
                        }
                    }

                    for (int i = reader.ReadInt32(); i > 0; i--)
                    {
                        int url = reader.ReadInt32();
                        int domain = reader.ReadInt32();

                        Tuple<int, int> tuple = new Tuple<int, int>(url, serpid);
                        if (!urlInfo.ContainsKey(tuple))
                        {
                            urlInfo.Add(tuple, q_id);
                        }

                        if (processQuery)
                        {
                            queryUrl.Write((int)queryUrl_id);
                            queryUrl_id++;
                            queryUrl.Write((int)url);
                            queryUrl.Write((int)queryId);

                            if (!urlsAdded.Contains(url))
                            {
                                urlFile.Write((int)url);
                                urlFile.Write((int)domain);
                                urlsAdded.Add(url);
                            }
                        }
                    }
                    break;
                }
                case 3:
                {
                    // TIME
                    int time = reader.ReadInt32();
                    // SERPID
                    int serpid = reader.ReadInt32();
                    // URL
                    int url = reader.ReadInt32();

                    int q_id = urlInfo[new Tuple<int, int>(url, serpid)];
                    click.Write((int)url);
                    click.Write((int)q_id);
                    click.Write((int)(time - lastTime));
                    lastTime = time;
                    break;
                }
            }
        }
    }

    foreach (BinaryWriter writer in writers)
    {
        if (writer != null)
        {
            writer.Dispose();
        }
    }

    toc();
    return result;
}
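// A minimal calling sketch, assuming these methods live in the same importer class and that
// rewriteData runs before importData (the per-table files it writes under workDir are the
// same files importData streams into PostgreSQL); the method name runImport is hypothetical.
private void runImport(string logFilename)
{
    if (rewriteData(logFilename))   // split the raw log into one binary file per table
    {
        importData();               // bulk-load those files with COPY ... FROM STDIN
    }
}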