예제 #1
0
        /// <summary>
        /// Read swf (header and tags), this is the only
        /// public method of <see cref="SwfDotNet.IO.SwfReader">SwfReader</see>
        /// with <see cref="SwfDotNet.IO.SwfReader.Close">Close</see> and
        /// <see cref="SwfDotNet.IO.SwfReader.ReadSwfHeader">ReadSwfHeader</see> methods.
        /// The returned <see cref="SwfDotNet.IO.Swf">Swf</see> object contains the swf header information and the
        /// tags list.
        /// </summary>
        /// <remarks>
        /// The underlying reader is closed when this method exits, whether it
        /// returns normally or an exception escapes while parsing.
        /// </remarks>
        /// <returns>A new <see cref="SwfDotNet.IO.Swf">Swf</see> built from the parsed header and tag list.</returns>
        public Swf ReadSwf()
        {
            try
            {
                // A leading 'C' marks a compressed swf; Inflate() is expected to
                // prepare the reader so the header can be parsed normally.
                if (br.PeekChar() == 'C')
                {
                    Inflate();
                }

                SwfHeader header = new SwfHeader();
                header.ReadData(br);
                this.version = header.Version;

                tagList = new BaseTagCollection();

                // Stop at the first EndTag even if bytes remain in the stream
                // (necessary for the "1 more byte" bug).
                bool readEndTag = false;

                while (br.BaseStream.Position < br.BaseStream.Length && !readEndTag)
                {
                    BaseTag b = SwfReader.ReadTag(this.version, this.br, this.tagList);
                    if (b != null)
                    {
                        if (b is EndTag)
                        {
                            readEndTag = true;
                        }
                        tagList.Add(b);
                    }
                }

                return new Swf(header, tagList);
            }
            finally
            {
                // Release the reader on every exit path, not only on success.
                if (br != null)
                {
                    br.Close();
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Loads a matrix from a binary file. Each record consists of an Int32
        /// user id, an Int32 term count and that many Int32 values; the values
        /// of one record are wrapped in a BinarySearchSet and paired with the
        /// user id.
        /// </summary>
        /// <param name="filename">Path of the binary file to read.</param>
        /// <returns>One (user id, value set) tuple per record, in file order.</returns>
        public static List <Tuple <int, BinarySearchSet <int> > > GetMatrix(String filename)
        {
            Stopwatch timer = Stopwatch.StartNew();
            var rows = new List<Tuple<int, BinarySearchSet<int>>>();

            using (var input = new BufferedBinaryReader(filename))
            {
                // One percent of the file size; dividing the stream position by
                // it yields the progress percentage.
                float onePercent = input.reader.BaseStream.Length / 100.0f;
                int recordNo = 0;

                while (input.PeekChar() > -1)
                {
                    recordNo++;

                    // Progress is reported once every 100000 records.
                    if (recordNo % 100000 == 0)
                    {
                        var percent = input.reader.BaseStream.Position / onePercent;
                        Console.Write("                 \rRead: {0} %\r", percent.ToString("0.000"));
                    }

                    int userId = input.ReadInt32();
                    var values = new List<int>();
                    for (int left = input.ReadInt32(); left > 0; left--)
                    {
                        values.Add(input.ReadInt32());
                    }

                    var valueSet = new BinarySearchSet<int>(values, Comparer<int>.Default);
                    rows.Add(Tuple.Create(userId, valueSet));
                }
            }

            timer.Stop();
            Console.WriteLine("Matrix loaded after: {0}", timer.Elapsed);

            return rows;
        }
예제 #3
0
        /// <summary>
        /// Fills <c>_list</c> from the file at
        /// <c>PathResolver.UserMatrixOutputProcessed</c>. Each record is an
        /// Int32 user id, an Int32 count and that many Int32 user ids.
        /// </summary>
        private void ReadData()
        {
            var sourcePath = PathResolver.UserMatrixOutputProcessed;

            // The field is replaced up front, so a failure part-way through
            // leaves the records read so far in place.
            _list = new List<Tuple<int, List<int>>>();

            using (var input = new BufferedBinaryReader(sourcePath))
            {
                while (input.PeekChar() > -1)
                {
                    int owner = input.ReadInt32();
                    int relatedCount = input.ReadInt32();

                    var related = new List<int>();
                    for (int k = 0; k < relatedCount; k++)
                    {
                        related.Add(input.ReadInt32());
                    }

                    _list.Add(Tuple.Create(owner, related));
                }
            }
        }
예제 #4
0
        /// <summary>
        /// Counts how often each URL occurs in the result lists of the binary
        /// log and writes the most frequent ones (at most 1000) to a text file,
        /// one "url TAB count" line per URL, ordered by descending count.
        /// </summary>
        /// <remarks>
        /// Only the first record seen for a given query id contributes to the
        /// counts; later records with the same query id are read and skipped.
        /// Timing information for each phase is written to the console.
        /// </remarks>
        /// <param name="input">Path of the binary log file.</param>
        /// <param name="output">Path of the text file to write.</param>
        private static void getTopUrls(string input, string output)
        {
            Dictionary<int, int> urlsCount = new Dictionary<int, int>();

            Stopwatch watch = new Stopwatch();
            watch.Start();

            using (BufferedBinaryReader reader = new BufferedBinaryReader(input))
            {
                HashSet<int> processedQueries = new HashSet<int>();

                while (reader.PeekChar() > -1)
                {
                    byte type = reader.ReadByte();

                    // SESSION
                    reader.ReadInt32();

                    switch (type)
                    {
                    case 0:
                        // DAY
                        reader.ReadInt32();
                        // USER
                        reader.ReadInt32();
                        break;

                    case 1:
                    case 2:
                    {
                        // TIME
                        reader.ReadInt32();
                        // SERPID
                        reader.ReadInt32();
                        // QUERYID
                        int query_id = reader.ReadInt32();

                        // Add returns false when the id was already present,
                        // i.e. the query has been counted before.
                        bool process = processedQueries.Add(query_id);

                        // TERMS (not needed here, only consumed)
                        for (int i = reader.ReadInt32(); i > 0; i--)
                        {
                            reader.ReadInt32();
                        }

                        // URLS & DOMAINS
                        for (int i = reader.ReadInt32(); i > 0; i--)
                        {
                            int url = reader.ReadInt32();
                            // DOMAIN (not needed here, only consumed)
                            reader.ReadInt32();

                            if (!process)
                            {
                                continue;
                            }

                            // A missing key leaves 'seen' at 0, so a single
                            // lookup covers both the first and later hits.
                            int seen;
                            urlsCount.TryGetValue(url, out seen);
                            urlsCount[url] = seen + 1;
                        }
                        break;
                    }

                    case 3:
                    {
                        // TIME
                        reader.ReadInt32();
                        // SERPID
                        reader.ReadInt32();
                        // URL
                        reader.ReadInt32();
                        break;
                    }
                    }
                }
            }

            // Move the counts from the map into a list that can be sorted.
            List<Tuple<int, int>> allUrls = new List<Tuple<int, int>>(urlsCount.Count);
            foreach (var element in urlsCount)
            {
                allUrls.Add(new Tuple<int, int>(element.Key, element.Value));
            }

            // The map is not used past this point; drop the reference so its
            // memory can be reclaimed without forcing a collection.
            urlsCount = null;

            watch.Stop();
            Console.WriteLine("... zakończono po {0} ({1})", watch.ElapsedMilliseconds, allUrls.Count);
            watch.Restart();

            // Sort by descending number of occurrences.
            allUrls.Sort((o1, o2) => o2.Item2.CompareTo(o1.Item2));

            watch.Stop();
            Console.WriteLine("Zakończono sortowanie danych po " + watch.ElapsedMilliseconds);
            watch.Restart();

            // Keep only the top 1000 URLs.
            if (allUrls.Count > 1000)
            {
                allUrls.RemoveRange(1000, allUrls.Count - 1000);
            }

            watch.Stop();
            Console.WriteLine("Wyczyszczono liste po " + watch.ElapsedMilliseconds);
            watch.Restart();

            using (StreamWriter writer = new StreamWriter(output))
            {
                foreach (var element in allUrls)
                {
                    writer.WriteLine(element.Item1 + "\t" + element.Item2);
                }
            }

            watch.Stop();
            Console.WriteLine("Zakończono zapisywanie danych po " + watch.ElapsedMilliseconds);
        }
예제 #5
0
        /// <summary>
        /// Reads up to 100 "term TAB count" lines from the top file, scans the
        /// binary log for queries containing those terms and writes a report:
        /// first the terms with their counts, then, per term, the sorted ids of
        /// the queries that contain it.
        /// </summary>
        /// <remarks>
        /// Only the first record seen for a given query id is examined; later
        /// records with the same query id are read and skipped. If the top file
        /// holds fewer than 100 lines, only the available lines are used.
        /// </remarks>
        /// <param name="logFilename">Path of the binary log file.</param>
        /// <param name="topFilename">Path of the text file with "term TAB count" lines.</param>
        /// <param name="output">Writer that receives the report.</param>
        private static void getTermsFromTop(string logFilename, string topFilename, StreamWriter output)
        {
            const int N = 100;
            List<Tuple<int, int>> terms = new List<Tuple<int, int>>(N);
            HashSet<int> processedQueries = new HashSet<int>();

            using (StreamReader reader = new StreamReader(topFilename))
            {
                string[] seps = new string[] { "\t" };

                for (int i = 0; i < N; i++)
                {
                    string line = reader.ReadLine();
                    if (line == null)
                    {
                        // The file has fewer than N entries; use what is there.
                        break;
                    }

                    string[] array = line.Split(seps, StringSplitOptions.None);
                    terms.Add(new Tuple<int, int>(Int32.Parse(array[0]), Int32.Parse(array[1])));
                }
            }

            List<int> sortedTerms = new List<int>(terms.Count);
            foreach (var term in terms)
            {
                sortedTerms.Add(term.Item1);
            }
            sortedTerms.Sort();

            // One query list per sorted term, plus a map from a term to the
            // index of its first occurrence in sortedTerms (the slot that the
            // report below prints).
            List<int>[] queries = new List<int>[sortedTerms.Count];
            Dictionary<int, int> termIndex = new Dictionary<int, int>();
            for (int i = 0; i < sortedTerms.Count; i++)
            {
                queries[i] = new List<int>();
                if (!termIndex.ContainsKey(sortedTerms[i]))
                {
                    termIndex.Add(sortedTerms[i], i);
                }
            }

            using (BufferedBinaryReader reader = new BufferedBinaryReader(logFilename))
            {
                while (reader.PeekChar() > -1)
                {
                    byte type = reader.ReadByte();

                    // SESSION
                    reader.ReadInt32();

                    switch (type)
                    {
                    case 0:
                        // DAY
                        reader.ReadInt32();
                        // USER
                        reader.ReadInt32();
                        break;

                    case 1:
                    case 2:
                    {
                        // TIME
                        reader.ReadInt32();
                        // SERPID
                        reader.ReadInt32();
                        // QUERYID
                        int query_id = reader.ReadInt32();

                        // Add returns false when the id was already present.
                        bool process = processedQueries.Add(query_id);

                        // TERMS
                        for (int i = reader.ReadInt32(); i > 0; i--)
                        {
                            int term = reader.ReadInt32();

                            int index;
                            if (process && termIndex.TryGetValue(term, out index))
                            {
                                queries[index].Add(query_id);
                            }
                        }

                        // URLS & DOMAINS (not needed here, only consumed)
                        for (int i = reader.ReadInt32(); i > 0; i--)
                        {
                            reader.ReadInt32();
                            reader.ReadInt32();
                        }
                        break;
                    }

                    case 3:
                    {
                        // TIME
                        reader.ReadInt32();
                        // SERPID
                        reader.ReadInt32();
                        // URL
                        reader.ReadInt32();
                        break;
                    }
                    }
                }
            }

            output.WriteLine("Top " + terms.Count + " terms:");
            for (int i = 0; i < terms.Count; i++)
            {
                output.WriteLine(terms[i].Item1 + "\t" + terms[i].Item2);
            }

            // The per-term sections follow the order of the top file.
            for (int i = 0; i < terms.Count; i++)
            {
                output.WriteLine();
                int index = termIndex[terms[i].Item1];

                queries[index].Sort();

                output.WriteLine(sortedTerms[index] + ":");
                foreach (var query in queries[index])
                {
                    output.WriteLine(query);
                }
            }

            output.WriteLine();
        }
예제 #6
0
        /// <summary>
        /// Compares every pair of entries in <c>users</c> and writes two binary
        /// files: the raw similarity matrix at
        /// <c>PathResolver.UserMatrixOutput</c>, and a filtered version at
        /// <c>PathResolver.UserMatrixOutputProcessed</c> that keeps only the
        /// neighbours whose similarity exceeds 1.5 times the average non-zero
        /// similarity.
        /// </summary>
        /// <remarks>
        /// Users are identified by their index in <c>users</c>.
        /// Raw record: user index, neighbour count, then (neighbour index,
        /// similarity) pairs, all Int32. Only pairs with j &gt; i and a non-zero
        /// similarity are stored.
        /// Processed record: user index, neighbour count, neighbour indexes;
        /// users without any remaining neighbour are omitted.
        /// Both files are opened with FileMode.CreateNew, so the call fails if
        /// either file already exists.
        /// </remarks>
        public void CompareUsers()
        {
            double simSum   = 0;
            int    simCount = 0;
            var count = users.Count;
            var path  = PathResolver.UserMatrixOutput;
            var path2 = PathResolver.UserMatrixOutputProcessed;

            using (var writer = new BinaryWriter(new FileStream(path, FileMode.CreateNew)))
            {
                for (int i = 0; i < count; i++)
                {
                    var list = new List<Tuple<int, int>>();
                    for (int j = i + 1; j < count; j++)
                    {
                        var res = CompareTwoUsers(users[i], users[j]);
                        if (res == 0)
                        {
                            continue;
                        }
                        list.Add(new Tuple<int, int>(j, res));
                        simSum += res;
                        simCount++;
                    }

                    // Record header: the current user's index. The neighbour
                    // list must not be indexed by i here; it only holds users
                    // j > i and is empty for the last user.
                    writer.Write(i);
                    writer.Write(list.Count);
                    foreach (var element in list)
                    {
                        writer.Write(element.Item1);
                        writer.Write(element.Item2);
                    }
                }
            }

            // Threshold: 1.5 times the mean non-zero similarity. With no
            // non-zero pair this is NaN, every comparison below is false and
            // the processed file stays empty.
            double minVal = 1.5 /* ?? */ * simSum / simCount;

            using (var writer = new BinaryWriter(new FileStream(path2, FileMode.CreateNew)))
                using (var reader = new BufferedBinaryReader(path))
                {
                    while (reader.PeekChar() > -1)
                    {
                        int currentUser = reader.ReadInt32();
                        var usersList   = new List<int>();
                        for (int i = reader.ReadInt32(); i > 0; i--)
                        {
                            int userId = reader.ReadInt32();
                            int sim    = reader.ReadInt32();
                            if (sim > minVal)
                            {
                                usersList.Add(userId);
                            }
                        }

                        if (usersList.Count > 0)
                        {
                            writer.Write(currentUser);
                            writer.Write(usersList.Count);
                            foreach (var userId in usersList)
                            {
                                writer.Write(userId);
                            }
                        }
                    }
                }
        }
예제 #7
0
        /// <summary>
        /// Reads every record from the wrapped binary reader and dispatches it
        /// to the matching callback of <c>_reader</c>: <c>onMetadata</c> for
        /// type 0, <c>onQueryAction</c> for types 1 and 2, <c>onClick</c> for
        /// type 3. <c>onBeginRead</c> is called before the first record and
        /// <c>onEndRead</c> after the last one.
        /// </summary>
        /// <remarks>
        /// A single Metadata, QueryAction and Click instance is reused for all
        /// records, so callbacks must copy anything they want to keep. The
        /// <c>terms</c>, <c>urls</c> and <c>domains</c> arrays are only
        /// reallocated when too small and may therefore be longer than
        /// <c>nTerms</c> / <c>nUrls</c>; only the first <c>nTerms</c> /
        /// <c>nUrls</c> entries belong to the current record.
        /// </remarks>
        /// <exception cref="System.IO.InvalidDataException">
        /// The next record starts with a type other than 0, 1, 2 or 3.
        /// </exception>
        public void Read()
        {
            var binaryReader = new BufferedBinaryReader(_binaryReader);

            Metadata    metadata    = new Metadata();
            QueryAction queryAction = new QueryAction();
            Click       click       = new Click();

            // One percent of the stream length, used for the progress display.
            float length = binaryReader.reader.BaseStream.Length / 100.0f;

            int lineCounter = 0;

            _reader.onBeginRead();

            // The type is only peeked here; each case consumes it itself.
            int type = binaryReader.PeekChar();

            while (type > -1)
            {
                if (++lineCounter % 100000 == 0)
                {
                    Console.Write("                 \rRead: {0} %\r",
                                  (binaryReader.reader.BaseStream.Position / length).ToString("0.000"));
                    if (lineCounter % 2000000 == 0)
                    {
                        GC.Collect();
                    }
                }

                switch (type)
                {
                case 0:
                {
                    metadata.type      = binaryReader.ReadByte();
                    metadata.sessionId = binaryReader.ReadInt32();
                    metadata.day       = binaryReader.ReadInt32();
                    metadata.userId    = binaryReader.ReadInt32();

                    _reader.onMetadata(metadata);
                    break;
                }

                case 1:
                case 2:
                {
                    queryAction.type      = binaryReader.ReadByte();
                    queryAction.sessionId = binaryReader.ReadInt32();
                    queryAction.time      = binaryReader.ReadInt32();
                    queryAction.serpid    = binaryReader.ReadInt32();
                    queryAction.queryId   = binaryReader.ReadInt32();

                    int nTerms = binaryReader.ReadInt32();
                    queryAction.nTerms = nTerms;
                    if (queryAction.terms == null || queryAction.terms.Length < nTerms)
                    {
                        queryAction.terms = new int[nTerms];
                    }

                    for (int i = 0; i < nTerms; i++)
                    {
                        queryAction.terms[i] = binaryReader.ReadInt32();
                    }

                    int nUrls = binaryReader.ReadInt32();
                    queryAction.nUrls = nUrls;
                    if (queryAction.urls == null || queryAction.urls.Length < nUrls)
                    {
                        // urls and domains are always sized together.
                        queryAction.urls    = new int[nUrls];
                        queryAction.domains = new int[nUrls];
                    }

                    for (int i = 0; i < nUrls; i++)
                    {
                        queryAction.urls[i]    = binaryReader.ReadInt32();
                        queryAction.domains[i] = binaryReader.ReadInt32();
                    }

                    _reader.onQueryAction(queryAction);
                    break;
                }

                case 3:
                {
                    click.type      = binaryReader.ReadByte();
                    click.sessionId = binaryReader.ReadInt32();
                    click.time      = binaryReader.ReadInt32();
                    click.serpid    = binaryReader.ReadInt32();
                    click.urlId     = binaryReader.ReadInt32();

                    _reader.onClick(click);
                    break;
                }

                default:
                    // Nothing was consumed for an unknown type, so peeking
                    // again would return the same value and loop forever.
                    throw new System.IO.InvalidDataException(
                              "Unknown record type " + type + " at record " + lineCounter);
                }

                type = binaryReader.PeekChar();
            }

            Console.Write("                  \r");

            _reader.onEndRead();
        }
예제 #8
0
        /// <summary>
        /// Loads the data into the database: for every table, the rows stored
        /// in the binary work file named after the table are streamed through a
        /// copy-in operation.
        /// </summary>
        /// <returns>Always <c>true</c>.</returns>
        private bool importData()
        {
            const int FLUSH_ROWS = 200000;

            tic("Importing data:", true, true);

            String[] tables =
            {
                sessionTableName, queryTableName, queryTermTableName, queryUrlTableName, clickTableName, urlTableName
            };

            for (int t = 0; t < tables.Length; t++)
            {
                String tableName = tables[t];

                tic(tableName, false, true);

                // Column layout of the table: 0 marks an Int32 column,
                // 1 marks a boolean column.
                int[] columnTypes = getTableTypes(tableName);

                NpgsqlCommand        insert    = new NpgsqlCommand(buildInsertCommand(tableName), connection);
                NpgsqlCopySerializer rowWriter = new NpgsqlCopySerializer(connection);
                NpgsqlCopyIn         copy      = new NpgsqlCopyIn(insert, connection, rowWriter.ToStream);

                copy.Start();

                using (BufferedBinaryReader source = new BufferedBinaryReader(workDir + tableName))
                {
                    int rows = 0;

                    while (source.PeekChar() > -1)
                    {
                        rows++;

                        foreach (int columnType in columnTypes)
                        {
                            switch (columnType)
                            {
                            case 0:
                            {
                                int number = source.ReadInt32();
                                rowWriter.AddInt32(number);
                                break;
                            }

                            case 1:
                            {
                                bool flag = source.ReadBool();
                                rowWriter.AddBool(flag);
                                break;
                            }
                            }
                        }

                        rowWriter.EndRow();

                        // Flush periodically while the rows are being added.
                        bool flushNow = (rows + 1) % FLUSH_ROWS == 0;
                        if (flushNow)
                        {
                            rowWriter.Flush();
                        }
                    }

                    String rowsLabel = String.Format("({0})", rows);
                    Console.Write(String.Format("{0,-15}", rowsLabel));
                }

                rowWriter.Flush();
                rowWriter.Close();
                copy.End();

                toc();
            }

            toc(true);

            return true;
        }
예제 #9
0
        /// <summary>
        /// Rewrites the data into separate binary files, one per database table
        /// (session, query, query term, query url, click and url).
        /// </summary>
        /// <remarks>
        /// Every output file is opened with FileMode.CreateNew, so the call
        /// fails if one of them already exists. All writers are disposed when
        /// the method exits, including on failure. Processing stops after
        /// <c>LINES_LIMIT</c> records when that limit is positive.
        /// </remarks>
        /// <param name="filename">File with the input data.</param>
        /// <returns>
        /// <c>true</c> when the input was processed; <c>false</c> when a record
        /// with a type greater than 3 was found (processing stops there).
        /// </returns>
        private bool rewriteData(String filename)
        {
            tic("Rewriting data:");

            bool result = true;

            BinaryWriter session   = null;
            BinaryWriter query     = null;
            BinaryWriter queryTerm = null;
            BinaryWriter queryUrl  = null;
            BinaryWriter click     = null;
            BinaryWriter urlFile   = null;

            try
            {
                session   = new BinaryWriter(new FileStream(workDir + sessionTableName, FileMode.CreateNew));
                query     = new BinaryWriter(new FileStream(workDir + queryTableName, FileMode.CreateNew));
                queryTerm = new BinaryWriter(new FileStream(workDir + queryTermTableName, FileMode.CreateNew));
                queryUrl  = new BinaryWriter(new FileStream(workDir + queryUrlTableName, FileMode.CreateNew));
                click     = new BinaryWriter(new FileStream(workDir + clickTableName, FileMode.CreateNew));
                urlFile   = new BinaryWriter(new FileStream(workDir + urlTableName, FileMode.CreateNew));

                using (BufferedBinaryReader reader = new BufferedBinaryReader(filename))
                {
                    int lineCounter = 0;
                    int q_idCount   = 0;
                    int queryUrl_id = 0;

                    // Maps a (url, serpid) pair to the q_id of the query record
                    // that first listed it; reset at every session record.
                    Dictionary<Tuple<int, int>, int> urlInfo = new Dictionary<Tuple<int, int>, int>();
                    int          lastTime     = 0;
                    HashSet<int> urlsAdded    = new HashSet<int>();
                    HashSet<int> queriesAdded = new HashSet<int>();

                    while (reader.PeekChar() > -1)
                    {
                        lineCounter++;

                        if (LINES_LIMIT > 0 && LINES_LIMIT < lineCounter)
                        {
                            break;
                        }

                        byte type = reader.ReadByte();
                        if (type > 3)
                        {
                            Console.WriteLine("Incorrect file in line " + lineCounter);
                            result = false;
                            break;
                        }

                        int sessionId = reader.ReadInt32();

                        switch (type)
                        {
                        case 0:
                        {
                            int day  = reader.ReadInt32();
                            int user = reader.ReadInt32();

                            session.Write(sessionId);
                            session.Write(day);
                            session.Write(user);

                            // A new session starts: forget the previous
                            // session's urls and restart the time deltas.
                            urlInfo.Clear();
                            lastTime = 0;
                            break;
                        }

                        case 1:
                        case 2:
                        {
                            // TIME
                            int time = reader.ReadInt32();
                            // SERPID
                            int serpid = reader.ReadInt32();
                            // QUERYID
                            int queryId = reader.ReadInt32();

                            int q_id = q_idCount++;

                            query.Write(q_id);
                            query.Write(queryId);
                            query.Write(sessionId);
                            query.Write(serpid);
                            // Time is stored as the delta to the previous action.
                            query.Write(time - lastTime);
                            lastTime = time;
                            query.Write(type == 2);

                            // Terms and urls are written only for the first
                            // record of each query id; Add returns false for
                            // an id that was already present.
                            bool processQuery = queriesAdded.Add(queryId);

                            for (int i = reader.ReadInt32(); i > 0; i--)
                            {
                                int term = reader.ReadInt32();

                                if (processQuery)
                                {
                                    queryTerm.Write(term);
                                    queryTerm.Write(queryId);
                                }
                            }

                            for (int i = reader.ReadInt32(); i > 0; i--)
                            {
                                int url    = reader.ReadInt32();
                                int domain = reader.ReadInt32();

                                Tuple<int, int> tuple = new Tuple<int, int>(url, serpid);
                                if (!urlInfo.ContainsKey(tuple))
                                {
                                    urlInfo.Add(tuple, q_id);
                                }

                                if (processQuery)
                                {
                                    queryUrl.Write(queryUrl_id);
                                    queryUrl_id++;

                                    queryUrl.Write(url);
                                    queryUrl.Write(queryId);

                                    // Each url is written to the url file once.
                                    if (urlsAdded.Add(url))
                                    {
                                        urlFile.Write(url);
                                        urlFile.Write(domain);
                                    }
                                }
                            }
                            break;
                        }

                        case 3:
                        {
                            // TIME
                            int time = reader.ReadInt32();
                            // SERPID
                            int serpid = reader.ReadInt32();
                            // URL
                            int url = reader.ReadInt32();

                            // Throws KeyNotFoundException when the click refers
                            // to a (url, serpid) pair not listed by a query of
                            // the current session.
                            int q_id = urlInfo[new Tuple<int, int>(url, serpid)];

                            click.Write(url);
                            click.Write(q_id);
                            click.Write(time - lastTime);
                            lastTime = time;

                            break;
                        }
                        }
                    }
                }
            }
            finally
            {
                // Dispose every writer that was opened, including the url
                // file, so all output is flushed and the files are released.
                BinaryWriter[] writers = new BinaryWriter[] { session, query, queryTerm, queryUrl, click, urlFile };
                foreach (BinaryWriter writer in writers)
                {
                    if (writer != null)
                    {
                        writer.Dispose();
                    }
                }
            }

            toc();

            return result;
        }