/// <summary>
/// Converts one record into a list of distinct word identifiers and
/// appends it to the binary output.
/// </summary>
/// <remarks>
/// Only records whose field 920 equals "PAZK" or "SPEC" are handled.
/// The record is rendered with the shared <c>formatter</c>, the text is
/// split with the shared <c>regex</c>, and every upper-cased match that is
/// at least three characters long and is not a stop word is mapped to an
/// integer identifier through <c>dictionary</c> (a new identifier is
/// allocated from <c>lastId</c> on first sight). Records that yield no
/// words write nothing; otherwise <c>goodRecords</c> and <c>longest</c>
/// are updated and a <c>BookData</c> is saved to <c>writer</c>.
/// </remarks>
/// <param name="record">Record to process.</param>
static void ProcessRecord
    (
        [NotNull] MarcRecord record
    )
{
    string worklist = record.FM(920);
    if (worklist != "PAZK" && worklist != "SPEC")
    {
        return;
    }

    // Field 999 converted to an integer; stored as BookData.Count.
    int count = record.FM(999).SafeToInt32();
    string formatted = formatter.FormatRecord(record);

    // The list keeps identifiers in order of first appearance (this is
    // what gets serialized); the set answers "already seen?" in O(1)
    // instead of a linear List.Contains scan for every word.
    List<int> words = new List<int>();
    HashSet<int> seen = new HashSet<int>();

    foreach (Match match in regex.Matches(formatted))
    {
        string word = match.Value.ToUpperInvariant();
        if (word.Length < 3 || stopwords.IsStopWord(word))
        {
            continue;
        }

        int id;
        if (!dictionary.TryGetValue(word, out id))
        {
            id = ++lastId;
            dictionary.Add(word, id);
        }

        // HashSet.Add returns false when the identifier is already present.
        if (seen.Add(id))
        {
            words.Add(id);
        }
    }

    if (words.Count == 0)
    {
        return;
    }

    goodRecords++;
    if (words.Count > longest)
    {
        longest = words.Count;
    }

    BookData data = new BookData
    {
        Count = count,
        Mfn = record.Mfn,
        Words = words.ToArray()
    };
    data.SaveToStream(writer);
}
/// <summary>
/// Entry point: builds a word dictionary and per-record word vectors for
/// the database whose file name is passed as the single argument.
/// </summary>
/// <remarks>
/// Steps, in order:
/// <list type="number">
/// <item>Load stop words from "IBIS.STW" and compile the format script
/// read from "words.pft".</item>
/// <item>Call <c>ReadRecord</c> for each MFN while "words.bin" is open as
/// the binary output.</item>
/// <item>Write the dictionary, sorted by word, to "words.dic" as
/// tab-separated word/identifier pairs.</item>
/// <item>Re-read "words.bin" to count occurrences of every word
/// identifier and report the maximum count and a derived threshold.</item>
/// <item>Re-read "words.bin" once more and write "words.csv": one row per
/// stored record with its word identifiers, zero-padded up to
/// <c>longest</c> columns, followed by the record's Count.</item>
/// </list>
/// Any exception is printed to the console; the elapsed time is printed
/// in every case.
/// </remarks>
/// <param name="args">Command line; exactly one element, the input file
/// name, is expected.</param>
static void Main(string[] args)
{
    if (args.Length != 1)
    {
        Console.WriteLine("Need 1 argument");
        return;
    }

    string inputFileName = args[0];

    Stopwatch stopwatch = Stopwatch.StartNew();

    try
    {
        stopwords = IrbisStopWords.ParseFile("IBIS.STW");

        string source = File.ReadAllText("words.pft");
        formatter = new PftFormatter()
        {
            Program = PftUtility.CompileProgram(source)
        };

        // Parallel processing through TPL Dataflow is disabled;
        // the setup is kept below for reference.
        //DataflowLinkOptions linkOptions = new DataflowLinkOptions
        //{
        //    PropagateCompletion = true
        //};
        //ExecutionDataflowBlockOptions executionOptions
        //    = new ExecutionDataflowBlockOptions
        //{
        //    MaxDegreeOfParallelism = 4
        //};
        //processBlock = new ActionBlock<MarcRecord>
        //    (
        //        (Action<MarcRecord>)ProcessRecord,
        //        executionOptions
        //    );

        using (FileStream stream = File.Create("words.bin"))
        using (writer = new BinaryWriter(stream))
        using (accessor = new DirectAccess64(inputFileName))
        {
            // NOTE(review): the upper bound is hard-coded instead of being
            // read from the database, and the loop below stops at
            // maxMfn - 1 -- confirm both are intended.
            //maxMfn = accessor.GetMaxMfn();
            maxMfn = 150000;
            Console.WriteLine("Max MFN={0}", maxMfn);

            // First, read all the records
            for (int mfn = 1; mfn < maxMfn; mfn++)
            {
                ReadRecord(mfn);
            }
        }

        using (StreamWriter textWriter = File.CreateText("words.dic"))
        {
            string[] keys = dictionary.Keys.ToArray();
            Array.Sort(keys);
            foreach (string key in keys)
            {
                textWriter.WriteLine("{0}\t{1}", key, dictionary[key]);
            }
        }

        // Wait for completion (only relevant for the disabled pipeline)
        // processBlock.Complete();
        // processBlock.Completion.Wait();

        Console.WriteLine
            (
                "Good records={0}, dictionary size={1}, longest array={2}",
                goodRecords,
                dictionary.Count,
                longest
            );

        // Pass 1 over words.bin: count occurrences of each word identifier.
        DictionaryCounterInt32<int> counter
            = new DictionaryCounterInt32<int>();
        using (FileStream stream = File.OpenRead("words.bin"))
        using (BinaryReader reader = new BinaryReader(stream))
        {
            while (stream.Position < stream.Length)
            {
                BookData data = new BookData();
                data.RestoreFromStream(reader);
                foreach (int word in data.Words)
                {
                    counter.Increment(word);
                }
            }
        }

        // When no record qualified the counter is empty and a bare Max()
        // would throw InvalidOperationException; DefaultIfEmpty() makes the
        // maximum 0 in that case so the run finishes with an empty CSV.
        int maxCount = counter.Values.DefaultIfEmpty().Max();

        // The threshold is only reported; nothing below uses it.
        int threshold = maxCount / 5 + 1;
        Console.WriteLine
            (
                "Max count={0}, threshold={1}",
                maxCount,
                threshold
            );

        // Pass 2 over words.bin: emit fixed-width CSV rows.
        using (FileStream stream = File.OpenRead("words.bin"))
        using (BinaryReader reader = new BinaryReader(stream))
        using (StreamWriter textWriter = File.CreateText("words.csv"))
        {
            while (stream.Position < stream.Length)
            {
                BookData data = new BookData();
                data.RestoreFromStream(reader);

                int i;
                for (i = 0; i < data.Words.Length; i++)
                {
                    textWriter.Write("{0},", data.Words[i]);
                }

                // Pad with zeros so every row has the same number of
                // word columns.
                for (; i < longest; i++)
                {
                    textWriter.Write("0,");
                }

                textWriter.WriteLine("{0}", data.Count);
            }
        }

        Console.WriteLine("Complete");
    }
    catch (Exception exception)
    {
        Console.WriteLine(exception);
    }

    stopwatch.Stop();
    TimeSpan elapsed = stopwatch.Elapsed;
    Console.WriteLine("Elapsed: {0}", elapsed.ToAutoString());
}