Example #1
0
        /// <summary>
        /// Loads the protocol XML file referenced by <c>fileName</c> and converts it into a <see cref="Protocol"/>.
        /// </summary>
        /// <param name="context">Database context used to prefetch existing entities and handed on to the body parser.</param>
        /// <returns>The protocol object filled in by the header and body parsing steps.</returns>
        public Protocol Parse(KnessetContext context)
        {
            // Prefetch existing DB objects so that "new or existing?" checks are quick lookups.
            PrepareHashsets(context);

            var ret = new Protocol();

            // Load the XML document with LINQ to XML.
            XDocument doc = XDocument.Load(fileName);

            // Header: the package part named /word/header2.xml. Of the children of its w:hdr element
            // the first is skipped (page number) and the second is kept (title).
            XElement headerPart = doc.Document.Root
                                  .Elements(pkg + "part")
                                  .First(part => part.Attribute(pkg + "name").Value == "/word/header2.xml");
            var docHeader = headerPart
                            .Element(pkg + "xmlData")
                            .Element(w + "hdr")
                            .Elements()
                            .Skip(1)
                            .Take(1)
                            .ToArray();

            // Body: the w:body element of the package part named /word/document.xml.
            XElement bodyPart = doc.Document.Root
                                .Elements(pkg + "part")
                                .First(part => part.Attribute(pkg + "name").Value == "/word/document.xml");
            var docBody = bodyPart
                          .Element(pkg + "xmlData")
                          .Element(w + "document")
                          .Element(w + "body");

            // Some protocol metadata comes from the header...
            ParseHeaderMetadata(ret, docHeader);

            // ...the remaining metadata and all of the content come from the body.
            ParseBody(ret, docBody, context);

            return ret;
        }
 /// <summary>
 /// Creates the window and immediately parses the given protocol file so the UI can be bound to the result.
 /// </summary>
 /// <param name="fileName">Path of the protocol file to parse; also used as the caption of the error message box.</param>
 public AddProtocolWindow(string fileName)
 {
     InitializeComponent();
     try
     {
         // The protocol is loaded here, in the constructor, because the parsed object is what the UI binds to.
         // A single context instance is kept (and not disposed per operation) because a second context
         // would treat entities fetched by the first one as new and try to insert them again.
         // The trade-off is sub-optimal memory usage when many protocols are imported.
         context    = new KnessetContext();
         fileParser = new ProtocolFileParser(fileName);
         Protocol   = fileParser.Parse(context);

         // Data-bind the UI to the parsed protocol.
         DataContext = Protocol;
         DisplayStatsDebug();
     }
     catch (Exception ex)
     {
         // Nothing is rethrown on failure. When AutoSaveAll is set the error is not reported at all;
         // otherwise the full exception text is shown to the user.
         if (AutoSaveAll)
         {
             return;
         }
         MessageBox.Show(ex.ToString(), fileName, MessageBoxButton.OK, MessageBoxImage.Asterisk);
     }
 }
Example #3
0
 /// <summary>
 /// Click handler that deletes every <see cref="WordsGroup"/> currently selected in <c>GroupsBox</c>
 /// from the database and then refreshes the shared UI data.
 /// </summary>
 /// <param name="sender">Event source (unused).</param>
 /// <param name="e">Event arguments (unused).</param>
 private void DeleteGroup(object sender, RoutedEventArgs e)
 {
     try
     {
         if (GroupsBox.SelectedItems.Count == 0)
         {
             return; // nothing selected, nothing to delete
         }

         // Show the busy cursor while talking to the database. The finally block below already
         // resets the cursor, but it was never switched to the wait cursor in the first place.
         Mouse.OverrideCursor = Cursors.Wait;

         using (KnessetContext context = new KnessetContext())
         {
             foreach (var wg in GroupsBox.SelectedItems.OfType<WordsGroup>())
             {
                 // The selected objects come from the UI, not from this context, so each one
                 // is attached before it is marked for removal.
                 context.WordsGroups.Attach(wg);
                 context.WordsGroups.Remove(wg);
             }
             context.SaveChanges();
         }
         UIData.UpdateData(false, true);
     }
     catch (Exception ex)
     {
         // Restore the normal cursor before the modal message box is displayed.
         Mouse.OverrideCursor = null;
         MessageBox.Show(ex.ToString());
     }
     finally
     {
         Mouse.OverrideCursor = null;
     }
 }
Example #4
0
        /// <summary>
        /// Computes the general statistics (protocol count and the per-protocol / per-paragraph / per-speaker
        /// averages) and binds them to the window through <c>DataContext</c>.
        /// On failure the exception is shown to the user and the window is closed.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void InitStatistics(object sender, RoutedEventArgs e)
        {
            // note to db admins,
            // there is no reason this should be slow
            // try setting:
            // query_cache_limit=20K
            // query_cache_size=4M
            // query_cache_type=1
            try
            {
                Mouse.OverrideCursor = Cursors.Wait;

                var res = new GeneralStatisticsResult();
                using (var context = new KnessetContext())
                {
                    int numWordsSpoken = context.ParagraphWords.Count();
                    int numParagraphs  = context.Paragraphs.Count();
                    res.NumProtocols   = context.Protocols.Count();

                    // Guard the ratios: on an empty database the divisions would produce NaN/Infinity
                    // and the Average() below would run over an empty sequence. In that case the
                    // affected properties are simply left untouched on the result object.
                    if (res.NumProtocols > 0)
                    {
                        res.ParagraphsPerProtocol = (float)numParagraphs / res.NumProtocols;
                        res.WordsPerProtocol      = (float)numWordsSpoken / res.NumProtocols;
                    }

                    if (numParagraphs > 0)
                    {
                        /*
                         * calculate speakers by protocol
                         * the key is that each speaker has only one paragraph with pn_pg_number=1
                         * SELECT AVG(sq.c) as v FROM (
                         *      SELECT COUNT(*) as c
                         *      FROM paragraph p
                         *      WHERE p.pn_pg_number = 1
                         *      GROUP BY p.c_name, p.pr_number
                         * ) as sq
                         */
                        res.SpeakersPerProtocol = (float)context.Paragraphs
                                                  .Where(x => x.pn_pg_number == 1)
                                                  .GroupBy(x => new { x.c_name, x.pr_number })
                                                  .Select(r => r.Count())
                                                  .Average();
                        res.ParagraphsPerProtocolSpeaker = res.ParagraphsPerProtocol / res.SpeakersPerProtocol;
                        res.WordsPerParagraph            = (float)numWordsSpoken / numParagraphs;
                    }
                }
                DataContext = res;
            }
            catch (Exception ex)
            {
                // Restore the cursor before the modal message box, then give up on this window.
                Mouse.OverrideCursor = null;
                MessageBox.Show(ex.ToString());
                Close();
            }
            finally
            {
                Mouse.OverrideCursor = null;
            }
        }
 /// <summary>
 /// Fills <c>AllCommittees</c> and <c>AllSpeakers</c> from the database. Each list starts with an
 /// empty string followed by the sorted names. Any exception is written to standard error and
 /// not rethrown.
 /// </summary>
 public WordFrequenciesData()
 {
     try
     {
         // Leading blank entry that is put in front of both lists.
         var blankEntry = new[] { string.Empty };

         using (var context = new KnessetContext())
         {
             // Committee names, sorted.
             var committeeNames = context.Committees
                                  .Select(c => c.c_name)
                                  .OrderBy(n => n);

             // Distinct speaker names taken from the paragraphs, sorted.
             var speakerNames = context.Paragraphs
                                .Select(pg => pg.pn_name)
                                .Distinct()
                                .OrderBy(n => n);

             AllCommittees = blankEntry.Union(committeeNames).ToList();
             AllSpeakers   = blankEntry.Union(speakerNames).ToList();
         }
     }
     catch (Exception ex)
     {
         Console.Error.WriteLine(ex);
     }
 }
Example #6
0
 /// <summary>
 /// Reloads the given protocol from the database together with its paragraphs, their words,
 /// its presence list and its invitations, and stores it in the <c>protocol</c> field.
 /// </summary>
 /// <param name="p">Protocol whose <c>c_name</c> and <c>pr_number</c> identify the row to load.</param>
 /// <param name="initialPg">Optional paragraph; when given, the loaded paragraph with the same
 /// <c>pg_number</c> (or null if there is none) is stored in <c>initialParagraph</c>.</param>
 private void LoadProtocol(Protocol p, Paragraph initialPg)
 {
     Mouse.OverrideCursor = Cursors.Wait;
     try
     {
         using (var context = new KnessetContext())
         {
             // Protocol + paragraphs + words in one query.
             protocol = context.Protocols
                        .Include("paragraphs.words")
                        .Where(x => x.c_name == p.c_name && x.pr_number == p.pr_number)
                        .First();

             // Presence and invitations are loaded explicitly afterwards.
             context.Entry(protocol).Collection(pr => pr.persence).Load();
             context.Entry(protocol).Collection(pr => pr.invitations).Load();

             if (initialPg == null)
             {
                 return; // no paragraph to pre-select
             }
             initialParagraph = protocol.paragraphs.FirstOrDefault(pg => pg.pg_number == initialPg.pg_number);
         }
     }
     finally
     {
         // Always give the normal cursor back, whether or not loading succeeded.
         Mouse.OverrideCursor = null;
     }
 }
Example #7
0
        /// <summary>
        /// Parses the rest of the metadata and reads all the content from the document body.
        /// Walks the direct child elements of <paramref name="docBody"/> once, driving a state machine
        /// (<c>ProtocolState</c>) that moves through the protocol's sections in order.
        /// </summary>
        /// <param name="ret">Protocol being filled in (<c>pr_number</c> and <c>pr_date</c> are set here).</param>
        /// <param name="docBody">The body element whose children are scanned.</param>
        /// <param name="context">Database context, passed through to <c>AddParagraph</c>.</param>
        /// <exception cref="Exception">Thrown when the scan ends in a state other than
        /// <c>Finished</c> and no paragraphs were collected.</exception>
        private void ParseBody(Protocol ret, XElement docBody, KnessetContext context)
        {
            var    state = ProtocolState.InitialScan; // use a state machine for the different protocol parts
            bool   isInvitationsTableRtl = false;
            Person currentSpeaker        = null;      // speaker that following paragraphs are attributed to; null = none

            foreach (XElement el in docBody.Elements())
            {
                switch (state)
                {
                case ProtocolState.InitialScan:
                    // skip elements until the protocol custom XML element is found
                    if (IsCustomXml(el, "פרוטוקול"))
                    {
                        // try to read the protocol number from the custom XML ("Num" attribute)
                        ret.pr_number = int.Parse(el.Element(w + "customXmlPr").Elements(w + "attr").First(x => x.Attribute(w + "name").Value == "Num").Attribute(w + "val").Value);
                        if (ret.pr_number == 0)
                        {     // if the protocol number is not in the custom XML, read it from the protocol title
                            int tmp;
                            // treat the last number in the protocol title as the protocol number:
                            // take the last run whose trimmed text ends with digits
                            string pNumString = (from r in el.Element(w + "p").Elements(w + "r")
                                                 let str = r.Element(w + "t").Value.Trim()
                                                           let rgxMatch = Regex.Match(str, "(\\d+)\\s*$")
                                                                          where rgxMatch.Success
                                                                          select rgxMatch.Groups[1].Value).LastOrDefault() ?? "";
                            if (int.TryParse(Regex.Replace(pNumString, "[^\\d]", ""), out tmp))
                            {
                                ret.pr_number = tmp;
                            }
                            // else we have a protocol without a number (?) - pr_number stays 0
                        }
                        // read the protocol date from the custom XML "Date" attribute (format dd/MM/yyyy).
                        ret.pr_date = DateTime.ParseExact(el.Element(w + "customXmlPr").Elements(w + "attr").First(x => x.Attribute(w + "name").Value == "Date").Attribute(w + "val").Value, "dd/MM/yyyy", CultureInfo.InvariantCulture);
                        // the next state is some protocol info
                        state = ProtocolState.ProtocolInfo;
                    }
                    break;

                case ProtocolState.ProtocolInfo:
                    if (IsCustomXml(el, "סדר_יום"))     // skip the info elements until we reach the agenda
                    {
                        state = ProtocolState.Agenda;
                    }
                    break;

                case ProtocolState.Agenda:
                    if (IsSubject(el))
                    {
                        // we've reached the subject, but another subject will follow later, so this one is ignored
                        state = ProtocolState.SubjectLong;
                    }
                    break;

                case ProtocolState.SubjectLong:
                    // ignore everything until we reach the list of members that are present
                    if (ContainsCustomXml(el, "חברי_הוועדה"))
                    {
                        state = ProtocolState.CommitteeMembers;
                    }
                    break;

                case ProtocolState.CommitteeMembers:
                    // read the list of members that are present until we reach the invitations part;
                    // only w:p elements are examined in this state
                    if (el.Name == w + "p")
                    {
                        if (ContainsCustomXml(el, "מוזמנים"))
                        {
                            // finished the presence list, now moving on to the invitations list
                            state = ProtocolState.Invitations;
                            break;
                        }
                        else if (IsSubject(el))
                        {
                            // no invitations section: go straight to the subject; ReadSubject supplies the next state
                            state = ReadSubject(ret, el);
                            break;
                        }
                        string content = ReadParagraph(el);
                        if (string.IsNullOrWhiteSpace(content) || content == "חברי הכנסת:")
                        {
                            // blank line or the list heading - not a person
                            break;
                        }
                        if (content.Contains("מוזמנים:"))
                        {
                            // in some rare cases the invitations title is not marked by custom XML and we need to check the text for it.
                            state = ProtocolState.Invitations;
                            break;
                        }
                        // if this is not a special line, it is a present member to add (once per person).
                        var person = FindOrAddPerson(content);
                        if (!newPresence.Any(x => x.person == person))
                        {
                            newPresence.Add(new Presence {
                                person = person, protocol = ret
                            });
                        }
                    }
                    break;

                case ProtocolState.Invitations:
                    if (el.Name == w + "tbl")
                    {
                        // invitations of non-committee members are stored in a table:
                        // if the table is right-to-left (has a bidiVisual descendant) the name is read from the first cell of each row,
                        // otherwise from the last cell
                        isInvitationsTableRtl = el.Descendants(w + "bidiVisual").Any();
                        var items = from tr in el.Elements(w + "tr")
                                    select ReadParagraph((isInvitationsTableRtl ? tr.Elements(w + "tc").First() : tr.Elements(w + "tc").Last()).Element(w + "p"));

                        // one invitation per distinct name text
                        foreach (var invitation in items.Distinct())
                        {
                            newInvitations.Add(new Invitation {
                                person = FindOrAddPerson(invitation), protocol = ret
                            });
                        }
                    }
                    else if (IsSubject(el))
                    {
                        // after the invitations table we keep reading until we reach the heading before the protocol starts;
                        // now we read the protocol title, and ReadSubject supplies the next state.
                        state = ReadSubject(ret, el);
                    }
                    break;

                case ProtocolState.Subject:
                    // after the subject heading we expect the actual protocol to start,
                    // so the current element is already handled as part of the talking section.
                    state = ProtocolState.Talking;
                    goto case ProtocolState.Talking;

                case ProtocolState.Talking:
                    // NOTE(review): this branch is tested first, so while a speaker is active every w:p element
                    // is taken as content, even one that would also satisfy the speaker check below - confirm this is intended.
                    if (currentSpeaker != null && el.Name == w + "p")
                    {
                        // a 'normal' paragraph while we have a speaker: add the paragraph to that speaker.
                        string paragraphContent = ReadParagraph(el);
                        AddParagraph(ret, currentSpeaker, paragraphContent, context);
                    }
                    else if (ContainsCustomXml(el, "יור") || ContainsCustomXml(el, "דובר") || ContainsCustomXml(el, "דובר_המשך") || ContainsCustomXml(el, "אורח"))
                    {
                        // the name is read from the customXml child of a w:p element, or from the w:p child of any other element
                        var contentElem = (el.Name == w + "p") ? el.Element(w + "customXml") : el.Element(w + "p");
                        // this is a new speaker starting to talk
                        string sprekerName = ReadParagraph(contentElem);
                        if (sprekerName.EndsWith(":"))
                        {
                            // drop the trailing colon from the speaker name
                            sprekerName = sprekerName.Substring(0, sprekerName.Length - 1);
                        }
                        currentSpeaker = FindOrAddPerson(sprekerName);
                    }
                    else if (IsCustomXml(el, "קריאה"))
                    {
                        // sometimes people make indistinguishable noises, just ignore...
                        // (paragraphs that follow are not attributed to anyone until the next speaker)
                        currentSpeaker = null;
                    }
                    else if (IsCustomXml(el, "סיום"))
                    {
                        // we've reached the end of the protocol.
                        state = ProtocolState.Finished;
                    }
                    break;

                case ProtocolState.Finished:
                    // everything after the end marker is ignored
                    break;
                }
            }
            // parsing counts as failed only if the end marker was never reached AND no paragraph was collected
            if (state != ProtocolState.Finished && newParagraphs.Count == 0)
            {
                throw new Exception(string.Format("Protocol parsing failed, did not pass state {0}", state));
            }
        }
Example #8
0
 /// <summary>
 /// Preloads all persons and words from the database into lookup dictionaries (keyed by person
 /// name and by word text), so existing rows are not added again by mistake and can be
 /// referenced from newly created objects.
 /// </summary>
 /// <param name="context">Database context to read the existing rows from.</param>
 private void PrepareHashsets(KnessetContext context)
 {
     existingPersons = context.Persons.ToDictionary(person => person.pn_name, person => person);
     existingWords   = context.Words.ToDictionary(entry => entry.word, entry => entry);
 }
Example #9
0
        // helpers:

        /// <summary>
        /// Turns one paragraph of text into a <see cref="Paragraph"/> plus one <see cref="ParagraphWord"/>
        /// per word (with character offset and word index), collecting the text between the words as
        /// the paragraph's space fillers. Blank content is ignored.
        /// </summary>
        /// <param name="protocol">Protocol the paragraph belongs to.</param>
        /// <param name="speaker">Person the paragraph is attributed to.</param>
        /// <param name="paragraphContent">Raw paragraph text.</param>
        /// <param name="context">Database context (not used by this method).</param>
        private void AddParagraph(Protocol protocol, Person speaker, string paragraphContent, KnessetContext context)
        {
            if (string.IsNullOrWhiteSpace(paragraphContent))
            {
                return;
            }

            // Advance this speaker's running paragraph counter (starting from zero on first use).
            string speakerName = speaker.pn_name;
            if (!speakerParahraphs.ContainsKey(speakerName))
            {
                speakerParahraphs[speakerName] = 0;
            }
            speakerParahraphs[speakerName]++;

            var newParagraph = new Paragraph
            {
                protocol     = protocol,
                speaker      = speaker,
                pg_number    = newParagraphs.Count + 1,         // 1-based position within the protocol
                pn_pg_number = speakerParahraphs[speakerName]   // 1-based position within this speaker's paragraphs
            };
            newParagraphs.Add(newParagraph);

            int position = 0;                 // running character offset inside the paragraph text
            var fillers  = new StringBuilder();

            paragraphReader.Read(
                paragraphContent,
                word =>
                {
                    // Reuse the known Word object, or register a new one on first sight.
                    Word entry;
                    if (!existingWords.TryGetValue(word, out entry))
                    {
                        entry = new Word {
                            word = word
                        };
                        newWords.Add(entry);
                        existingWords.Add(word, entry);
                    }

                    newParagraphWords.Add(new ParagraphWord
                    {
                        paragraph   = newParagraph,
                        WordObj     = entry,
                        pg_offset   = position,
                        word_number = newParagraph.pg_num_words++
                    });
                    position += word.Length;
                },
                filler =>
                {
                    // Non-word text is accumulated and also counts towards the offsets.
                    fillers.Append(filler);
                    position += filler.Length;
                });

            newParagraph.pg_space_fillers = fillers.ToString();
        }
Example #10
0
        /// <summary>
        /// Click handler that validates the entered group name and word list and, when valid, stores a
        /// new words group together with its words in the database, then closes the dialog with
        /// <c>DialogResult = true</c>.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void SaveGroup(object sender, RoutedEventArgs e)
        {
            // first do validations that do not require db access

            // Trim the name so a whitespace-only entry is rejected as empty and stored names
            // never carry accidental leading/trailing blanks.
            string grpName = (groupNameTxt.Text ?? string.Empty).Trim();

            if (grpName.Length == 0)
            {
                MessageBox.Show("חובה להזין שם לקבוצה");
                groupNameTxt.Focus();
                return;
            }

            // use the paragraph reader to remove all "non-word" chars and split into words;
            // the hash set removes duplicates
            ParagraphReader reader = new ParagraphReader();
            HashSet<string> items  = new HashSet<string>(reader.ReadWords(wordsListTxt.Text));

            if (items.Count == 0)
            {
                MessageBox.Show("חובה להזין מילים לקבוצה");
                wordsListTxt.SelectAll();
                return;
            }
            else if (items.Count > WordsGroup.MaxItemsInGroup)
            {
                MessageBox.Show(string.Format("לא ניתן להזין יותר מ-{0} מילים בקבוצה", WordsGroup.MaxItemsInGroup));
                return;
            }

            try
            {
                using (KnessetContext context = new KnessetContext())
                {
                    // now do validations that do require db access
                    WordsGroup existing = context.WordsGroups.Find(grpName);
                    if (existing != null)
                    {
                        MessageBox.Show("כבר קיימת קבוצה עם שם זה");
                        return;
                    }

                    // input is OK, save the new group; if a word is not in the words relation add it
                    // (we might define groups before loading protocols)
                    WordsGroup group = new WordsGroup {
                        g_name = grpName
                    };
                    context.WordsGroups.Add(group);

                    foreach (var wordStr in items)
                    {
                        Word wordObj = context.Words.Find(wordStr);
                        if (wordObj == null)
                        {
                            wordObj = new Word {
                                word = wordStr
                            };
                            context.Words.Add(wordObj);
                        }
                        context.WordInGroups.Add(new WordInGroup {
                            wordsGroup = group, WordObj = wordObj
                        });
                    }
                    context.SaveChanges(); // commit all changes to DB
                }
                DialogResult = true;       // can be used by parent window - marks success
                Close();
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.ToString());
            }
        }
Example #11
0
        /// <summary>
        /// Click handler that writes a text index of word occurrences to a file chosen by the user.
        /// The index can be restricted to the selected words groups and/or the selected protocols;
        /// at least one of the two selections must be non-empty.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void CreateIndex(object sender, RoutedEventArgs e)
        {
            var selectedGroups    = GroupsBox.SelectedItems.OfType<WordsGroup>().ToList();
            var selectedProtocols = ProtocolsBox.SelectedItems.OfType<Protocol>().ToList();

            if (selectedGroups.Count == 0 && selectedProtocols.Count == 0)
            {
                MessageBox.Show("חובה לבחור פרוטוקולים או קבוצות");
                return;
            }

            var sfd = new SaveFileDialog {
                Filter = "Text Index File | *.txt", AddExtension = true
            };

            // Nothing to do unless the user picked a destination file and confirmed.
            if (!sfd.ShowDialog().GetValueOrDefault())
            {
                return;
            }

            try
            {
                Mouse.OverrideCursor = Cursors.Wait;

                using (var sw = new StreamWriter(sfd.FileName, false, Encoding.Unicode)) // output file
                using (var context = new KnessetContext())                               // db connection
                {
                    // Base query: word occurrences together with their paragraph info.
                    IQueryable<ParagraphWord> occurrences = context.ParagraphWords.Include("paragraph");
                    if (selectedGroups.Count > 0)
                    {
                        // Filter by the words of the selected group(s).
                        string[] selectedGroupsWords = distinctFromGroups(selectedGroups);
                        occurrences = occurrences.Where(x => selectedGroupsWords.Contains(x.word));
                    }

                    IEnumerable<ParagraphWord> sortedOccurrences;
                    if (selectedProtocols.Count == 0)
                    {
                        // No protocol filter: everything is written, ordered by word in the query itself.
                        sortedOccurrences = occurrences.OrderBy(x => x.word);
                    }
                    else
                    {
                        // Fetch each protocol separately (there is no way to build an OR condition of
                        // arbitrary size) and sort by word in memory once everything has been fetched.
                        // Performance is acceptable; otherwise a custom SQL query would be needed.
                        var fetched = new List<ParagraphWord>();
                        foreach (var protocol in selectedProtocols)
                        {
                            fetched.AddRange(occurrences.Where(x => x.c_name == protocol.c_name && x.pr_number == protocol.pr_number));
                        }
                        sortedOccurrences = fetched.OrderBy(x => x.word);
                    }

                    // Write the index: a heading line whenever the word changes, then two detail
                    // lines for every occurrence.
                    string previousWord = null;
                    foreach (var pw in sortedOccurrences)
                    {
                        if (pw.word != previousWord)
                        {
                            previousWord = pw.word;
                            sw.WriteLine(pw.word);
                        }
                        sw.WriteLine("\t{0} [{1}] Paragraph {2} Word {3}", pw.c_name, pw.pr_number, pw.pg_number, pw.word_number);
                        sw.WriteLine("\t{0} [{1}] Speaker {2} Paragraph {3} Offset {4}", pw.c_name, pw.pr_number, pw.paragraph.pn_name, pw.paragraph.pn_pg_number, pw.pg_offset);
                    }
                }
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.ToString());
                return;
            }
            finally
            {
                Mouse.OverrideCursor = null;
            }

            // display a success message to the user
            MessageBox.Show(sfd.FileName, "האינדקס נשמר", MessageBoxButton.OK, MessageBoxImage.Information);
        }