Exemplo n.º 1
0
 // Builds the scoring structure for one sentence pair from two already-split sentences.
 //
 // SScandsrc: split source sentence; its words populate WordScoresrc and auxsrc.
 // SScandtgt: split target sentence; its words populate WordScoretgt and auxtgt.
 //
 // After construction auxsrc/auxtgt hold the words as a comma-separated list of
 // single-quoted SQL string literals (e.g. 'la','casa'), suitable for an IN (...) clause.
 public SentencePairScoreClass(SentenceSpliterClass SScandsrc, SentenceSpliterClass SScandtgt)
 {
     debuga           = false;
     looksfine        = false;
     rejectinfo       = "";
     originalsentence = SScandsrc.originalsentence;
     sbsentence       = new StringBuilder("");

     // NOTE: nsrc/ntgt are the index of the LAST word (word count minus one),
     // so they are -1 for an empty sentence. The arrays below use the real count.
     nsrc = SScandsrc.words.Length - 1;
     ntgt = SScandtgt.words.Length - 1;

     WordScoresrc = new WordScoreClass[nsrc + 1];
     for (int i = 0; i <= nsrc; i += 1)
     {
         WordScoresrc[i]      = new WordScoreClass();
         WordScoresrc[i].word = SScandsrc.words[i];
     }

     WordScoretgt = new WordScoreClass[ntgt + 1];
     for (int i = 0; i <= ntgt; i += 1)
     {
         WordScoretgt[i]      = new WordScoreClass();
         WordScoretgt[i].word = SScandtgt.words[i];
     }

     // Appending (rather than assigning) keeps whatever initial value the fields
     // carry. Joining with separators avoids the old "strip the trailing comma"
     // step, which threw when a sentence had no words at all.
     auxsrc += QuoteWordsForSql(SScandsrc.words);
     auxtgt += QuoteWordsForSql(SScandtgt.words);
 }

 // Renders words as a comma-separated list of single-quoted SQL string literals.
 // Embedded apostrophes are doubled so a word such as l'home cannot terminate the literal.
 private static string QuoteWordsForSql(string[] words)
 {
     var list = new StringBuilder();
     foreach (string word in words)
     {
         if (list.Length > 0)
         {
             list.Append(',');
         }
         list.Append('\'').Append((word ?? "").Replace("'", "''")).Append('\'');
     }
     return list.ToString();
 }
Exemplo n.º 2
0
        // Entry point. Reads two parallel text files line by line, scores each
        // sentence pair in both directions against the keyword tables of a SQLite
        // database, and prints/logs the pairs that are rejected.
        //
        // Command line (read through Environment.GetCommandLineArgs, not args):
        //   1: database file   2: source file   3: target file   4: liminf   5: limsup
        static void Main(string[] args)
        {
            // GetCommandLineArgs() stores the executable name at index 0, so the
            // five parameters live at indices 1..5.
            String[] arguments = Environment.GetCommandLineArgs();
            if (arguments.Length < 6)
            {
                Console.WriteLine("Usage: program DB SRCFILE TGTFILE LIMINF LIMSUP");
                Environment.ExitCode = 1;
                return;
            }

            string DBname  = arguments[1];
            string srcfile = arguments[2];
            string tgtfile = arguments[3];
            double liminf;
            double limsup; // parsed to validate the command line; not used further in this method

            // Same number style Double.Parse uses by default, without throwing on bad input.
            var numberStyle = System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
            var invariant   = System.Globalization.CultureInfo.InvariantCulture;
            if (!Double.TryParse(arguments[4], numberStyle, invariant, out liminf) ||
                !Double.TryParse(arguments[5], numberStyle, invariant, out limsup))
            {
                Console.WriteLine("LIMINF and LIMSUP must be numbers written with a decimal point, e.g. 0.3");
                Environment.ExitCode = 1;
                return;
            }

            SQLiteConnection m_dbConnection = new SQLiteConnection(string.Format(
                                                                       "Data Source={0};Version=3;", DBname));
            var    command = new SQLiteCommand(m_dbConnection);
            string sqlaux, sqlaux2;
            string sqlauxI, sqlaux2I; // inverse (target -> source) SQL

            // NOTE(review): {0}/{1} are filled with pre-quoted word lists taken from
            // SentencePairScoreClass.auxsrc/auxtgt; the queries are only as safe as
            // the quoting done there. Consider real SQL parameters.
            sqlaux = "select src.keyword, tgt.keyword, srccand.freqinsample from src " +
                     "join srccand on src.indext=srccand.indexsrc " +
                     "join tgt on srccand.indextgt=tgt.indext " +
                     "where src.keyword in ( {0} ) " +
                     "and tgt.keyword in ( {1} ) " +
                     "order by freqinsample DESC ";
            // Same query in the opposite direction; tgtcand reuses the column names
            // indexsrc/indextgt, so the joins below are intentional.
            sqlauxI = "select tgt.keyword, src.keyword, tgtcand.freqinsample from tgt " +
                      "join tgtcand on tgt.indext=tgtcand.indexsrc " +
                      "join src on tgtcand.indextgt=src.indext " +
                      "where tgt.keyword in ( {0} ) " +
                      "and src.keyword in ( {1} ) " +
                      "order by freqinsample DESC ";
            // "Unrestricted" variants: no filter on the words of the other sentence.
            sqlaux2 = "select src.keyword, tgt.keyword, srccand.freqinsample from src " +
                      "join srccand on src.indext=srccand.indexsrc " +
                      "join tgt on srccand.indextgt=tgt.indext " +
                      "where src.keyword in ( {0} ) " +
                      "order by freqinsample DESC ";
            sqlaux2I = "select tgt.keyword, src.keyword, tgtcand.freqinsample from tgt " +
                       "join tgtcand on tgt.indext=tgtcand.indexsrc " +
                       "join src on tgtcand.indextgt=src.indext " +
                       "where tgt.keyword in ( {0} ) " +
                       "order by freqinsample DESC ";

            int nlines    = 0;
            int nlinesbad = 0;

            // Totals, indexed by closetoperfection (slots 0..100).
            int[] punct = new int[101];
            int[] totoov = new int[101];
            int[] totunktra = new int[101];
            int[] totnword = new int[101];
            int   ninfo, ninfocurrent, ninfototal;

            ninfo = 500; ninfocurrent = ninfo; ninfototal = 0;

            var logOk     = new logger(@"C:\VS2017\fisher\logOK.txt");
            var logOOV    = new logger(@"C:\VS2017\fisher\logOOV.txt");
            var logBadSrc = new logger(@"C:\VS2017\fisher\logBadSrc.txt");
            var logBadTgt = new logger(@"C:\VS2017\fisher\logBadTgt.txt");
            var logBadAll = new logger(@"C:\VS2017\fisher\logBadAll.txt");

            logOk.LogString("Hi babe");

            try
            {
                m_dbConnection.Open();

                // The readers are opened inside the try so that a missing or
                // unreadable file reaches the error report below.
                using (StreamReader srsrc = new StreamReader(srcfile, Encoding.UTF8, true))
                using (StreamReader srtgt = new StreamReader(tgtfile, Encoding.UTF8, true))
                {
                    while (srsrc.Peek() != -1)
                    {
                        String linesrc = srsrc.ReadLine();
                        String linetgt = srtgt.ReadLine();
                        if (linetgt == null)
                        {
                            // The target file ran out first: there is nothing to pair with.
                            Console.WriteLine(string.Format("File {0} has fewer lines than {1}; stopping after {2} lines",
                                                            tgtfile, srcfile, nlines));
                            break;
                        }
                        nlines += 1;

                        var SScandsrc = new SentenceSpliterClass(linesrc, false);
                        var SScandtgt = new SentenceSpliterClass(linetgt, false);
                        SScandsrc.Split(); SScandtgt.Split();

                        // Source -> target scoring.
                        var sp = new SentencePairScoreClass(SScandsrc, SScandtgt);
                        sp.debuga = true;
                        DataSet ds             = QueryWordScores(command, string.Format(sqlaux, sp.auxsrc, sp.auxtgt));
                        DataSet dsunrestricted = QueryWordScores(command, string.Format(sqlaux2, sp.auxsrc));
                        sp.Scoresrc(ds, dsunrestricted);

                        // Target -> source scoring (roles swapped).
                        var spI = new SentencePairScoreClass(SScandtgt, SScandsrc);
                        spI.debuga = true;
                        DataSet dsI             = QueryWordScores(command, string.Format(sqlauxI, sp.auxtgt, sp.auxsrc));
                        DataSet dsIunrestricted = QueryWordScores(command, string.Format(sqlaux2I, sp.auxtgt));
                        spI.Scoresrc(dsI, dsIunrestricted);

                        // Show both sentences and one marker line per direction:
                        // the '>' is placed at column closetoperfection / 5.
                        string auxs;
                        Console.WriteLine(linesrc);
                        Console.WriteLine(linetgt);
                        auxs = "                    | ".Remove(sp.closetoperfection / 5, 1).Insert(sp.closetoperfection / 5, ">");
                        Console.WriteLine(auxs);
                        auxs = "                    |".Remove(spI.closetoperfection / 5, 1).Insert(spI.closetoperfection / 5, ">");
                        Console.WriteLine(auxs);

                        // The score is based on a sum over words, so pairs with fewer
                        // than 5 matches are rejected outright.
                        sp.looksfine = true; spI.looksfine = true;
                        if (sp.nmatchedu < 5)
                        {
                            sp.looksfine = false; sp.rejectinfo = "LT5";
                        }
                        if (spI.nmatchedu < 5)
                        {
                            spI.looksfine = false; spI.rejectinfo = "LT5";
                        }
                        if (spI.looksfine | sp.looksfine)
                        {
                            if (sp.closetoperfection < 80)
                            {
                                sp.looksfine = false; sp.rejectinfo = "SRCQualityBab";
                            }
                            if (spI.closetoperfection < 80)
                            {
                                spI.looksfine = false; spI.rejectinfo = "TGTQualityBab";
                            }
                        }

                        // OOV balance: the share of non-OOV words may differ by at most
                        // 20 points between the two directions. The comparison is skipped
                        // when either denominator is zero (it used to divide by zero).
                        sp.OOVbalanced = true; spI.OOVbalanced = true;
                        if (sp.nsrc != 0 && spI.nsrc != 0)
                        {
                            int percOOV  = (sp.nsrc - sp.noov) * 100 / sp.nsrc;
                            int percOOVI = (spI.nsrc - spI.noov) * 100 / spI.nsrc;
                            if (Math.Abs(percOOV - percOOVI) > 20)
                            {
                                sp.OOVbalanced  = false;
                                spI.OOVbalanced = false;
                                // One log line per direction, each built from scratch.
                                logOOV.LogString(DescribeOov(sp));
                                logOOV.LogString(DescribeOov(spI));
                            }
                        }

                        // Report rejected pairs and wait for the operator.
                        if (!(sp.looksfine & spI.looksfine))
                        {
                            Console.WriteLine("SRC->" + sp.rejectinfo);
                            Console.WriteLine("TGT->" + spI.rejectinfo);
                            Console.ReadLine();
                        }

                        // NOTE(review): the 500000 threshold looks like a way to switch
                        // this statistics block off - confirm before relying on it.
                        if (sp.nmatchedu >= 500000)
                        {
                            if (sp.closetoperfection <= liminf)
                            {
                                nlinesbad += 1;
                                punct[sp.closetoperfection]     += 1;
                                totoov[sp.closetoperfection]    += sp.noov;
                                totunktra[sp.closetoperfection] += sp.nunktra;
                                totnword[sp.closetoperfection]  += sp.nsrc;
                                Console.WriteLine(sp.sbsentence.ToString());
                                Console.WriteLine(linesrc);
                                Console.WriteLine(linetgt);
                                Console.WriteLine("Intro to continue "); Console.ReadLine();
                                ninfocurrent -= 1;
                                if (ninfocurrent == 0)
                                {
                                    ninfototal  += ninfo;
                                    ninfocurrent = ninfo; Console.WriteLine(ninfototal);
                                    // Always false in code; kept as a flag that can be
                                    // flipped from a debugger to stop the run.
                                    bool aborta = false;
                                    if (aborta)
                                    {
                                        break;
                                    }
                                }
                            }
                        }
                    }

                    Console.WriteLine(string.Format("File {0} -> {1} lines", srcfile, nlines));
                    Console.WriteLine("punct, totoov, totunktra, totnword");
                    // Bounded by the array length (the old "i <= 101" overran it).
                    for (int i = 0; i < punct.Length; i++)
                    {
                        Console.WriteLine("{0}, {1}, {2}, {3}", punct[i], totoov[i], totunktra[i], totnword[i]);
                    }
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("Fatal error.");
                Console.WriteLine(String.Format("The file {0} could not be read:", srcfile));
                Console.WriteLine(e.Message);
                return;
            }
            finally
            {
                m_dbConnection.Close();
                // The loggers are closed here, after the loop that writes to them.
                logOk.Close(); logOOV.Close(); logBadAll.Close(); logBadSrc.Close(); logBadTgt.Close();
            }

            Console.WriteLine(string.Format("lines/badlines {0}/{1}", nlines, nlinesbad));

            Console.WriteLine("The End");
        }

        // Runs sql on the shared command and returns the rows in a DataSet table named "WORDSCORE".
        private static DataSet QueryWordScores(SQLiteCommand command, string sql)
        {
            command.CommandText = sql;
            var result  = new DataSet();
            var adapter = new SQLiteDataAdapter(command);
            adapter.Fill(result, "WORDSCORE");
            return result;
        }

        // Builds one OOV log line: the pair's noov count followed by every word flagged matched_unrestricted.
        private static string DescribeOov(SentencePairScoreClass pair)
        {
            string text = pair.noov.ToString();
            foreach (WordScoreClass w in pair.WordScoresrc)
            {
                // NOTE(review): the original comment called these words OOV - confirm the flag's meaning.
                if (w.matched_unrestricted)
                {
                    text += " " + w.word;
                }
            }
            return text;
        }