language_identification.cpp

This example shows how to use the SLIDI interface.

00001 /*****************************************************************
00002  *  BSAPI Language Identification Example                        *
00003  *                                                               *
00004  *  Copyright   : (C) 2006-2008 by Petr Schwarz & Pavel Matejka  *
00005  *                         & Tomas Cipr                          *
00006  *                                                               *
00007  *  Email       : {schwarz,matejka,cipr}@phonexia.com            *
00008  ****************************************************************/
00009 
00010 #include <stdio.h>
00011 #include <stdlib.h>
00012 #include <errno.h>
00013 #include <stdarg.h>
00014 #include <dirent.h>
00015 
00016 #include "bsapi.h"
00017 #include "getopt.h"
00018 
00019 #ifdef WIN32
00020   #define DIRSEP "\\"
00021 #else
00022   #define DIRSEP "/"
00023 #endif
00024 
00025 #define LID_MIN_LEN_TO_PROCESS    2.0f       // 2 seconds
00026 #define LID_BELOW_MIN_LEN_TEXT    "(too_short)"
00027 
00028 // This is an error handling object. If an error occur, a message is sent to this object at first. 
00029 // Then the running function will exit with the false or 0 return value. The object also accept 
00030 // warning and logging messages.
00031 class ErrorHandler : public SErrorCallbackI 
00032 {
00033   public:
00034     ErrorHandler() : mVerbose(false) {;}
00035     virtual void BSAPI_METHOD OnTextMessage(unsigned int iId, message_type type, unsigned int messageId, char *pMessage)
00036     {
00037       switch(type)
00038       {
00039         case mtError:
00040           fprintf(stderr, "ERROR: %s - %s\n", BSAPIInterfaceId2Text(iId), pMessage);
00041           break;
00042         case mtWarning:
00043           fprintf(stderr, "WARNING: %s - %s\n", BSAPIInterfaceId2Text(iId), pMessage);
00044           break;
00045         case mtLog:
00046           LogMessage(pMessage);
00047           break;
00048       }
00049     }
00050 
00051     void LogMessage(const char *pMessage, ...)
00052     {
00053       if (mVerbose)
00054       {
00055         va_list ap;
00056         va_start(ap, pMessage);
00057         vfprintf(stderr, pMessage, ap);
00058         fprintf(stderr, "\n");
00059         va_end(ap);
00060       }
00061     }
00062     
00063     void SetVerbose(bool verbose) {mVerbose = verbose;}
00064   
00065   protected:
00066     bool mVerbose;
00067 } gErrorHandler;
00068 
00069 void help()
00070 { 
00071   puts("\n Language identification                                        ");
00072   printf(" %s\n", BSAPIVersion());
00073   puts(" ================================================================ ");
00074   puts("                                                                  ");
00075   puts(" USAGE: lid [options]                                             ");
00076   puts("                                                                  ");
00077   puts(" system configuration:                                            ");
00078   puts("   -c file         configuration file                             ");
00079   puts("   -m dir          model directory                                ");
00080   puts("   -a str1,str2... active languages                               ");
00081   puts("                                                                  ");
00082   puts(" input:                                                           ");
00083   puts("   -i file         input file                                     ");
00084   puts("   -l file         list of input files                            ");
00085   puts("   -d dir          input directory                                ");
00086   puts("   -e str [raw]    extension of audio files                       ");
00087   puts("   -w fmt [lin16]  waveform format (lin16, lin8, alaw, mulaw)     ");
00088   puts("   -n num [1]      number of channels in audio files              ");
00089   puts("   -p start,len    active waveform part (in seconds)              ");
00090   puts("                                                                  ");
00091   puts(" output:                                                          ");
00092   puts("   -s file         output score file                              ");
00093   puts("   -r              produce scores for all languages               "); 
00094   puts("   -u              add even the UBM score                         ");
00095   puts("   -v              verbose mode                                   ");
00096   puts("                                                                  ");
00097   puts(" training:                                                        ");
00098   puts("   -t              enable training                                ");
00099   puts("   -g str          name of language to be trained                 ");
00100   puts("                                                                  ");
00101 }
00102 
00103 // This function outputs the winning language or scores for all of them
00104 bool DumpScore(char *pInputName, SLIDI *plid, bool dumpAllScores, bool addUBMScore, FILE *pFileHandle)
00105 {
00106   // Dump scores for all languages. 
00107   // The line format is: input_file_name score1 score2 ... scoreN
00108   // In addition, the first line will contain names of all languages
00109   if(dumpAllScores)
00110   {
00111     static bool first_time = true;
00112     int i;
00113     int num;
00114     if(first_time)
00115     {
00116 
00117       char **pplang_names = plid->GetModelNames(&num);   // this function can return 0 in case of an error
00118       if(!pplang_names) 
00119         return false;
00120       if(addUBMScore)
00121         fprintf(pFileHandle, "UBM");      
00122       for(i = 1; i < num; i++)      // index 0 is reserved to universal background model
00123       {
00124         if(i == 1 && !addUBMScore)
00125           fprintf(pFileHandle, "%s", pplang_names[i]);
00126         else
00127           fprintf(pFileHandle, " %s", pplang_names[i]);
00128       }
00129       fprintf(pFileHandle, "\n");
00130       first_time = false;
00131     }
00132     float *pscores = plid->GetModelScores(&num);
00133     if(!pscores)
00134       return false;
00135     fprintf(pFileHandle, "%s", pInputName);
00136     if (plid->GetTestLength() < LID_MIN_LEN_TO_PROCESS)
00137     {
00138       if(addUBMScore)
00139         fprintf(pFileHandle, " -inf");
00140       for(i = 1; i < num; i++)                           
00141         fprintf(pFileHandle, " -inf");
00142       fprintf(pFileHandle, " %s", LID_BELOW_MIN_LEN_TEXT);
00143     }
00144     else
00145     {
00146       if(addUBMScore)
00147         fprintf(pFileHandle, " %f", pscores[0]);
00148       for(i = 1; i < num; i++)                           
00149         fprintf(pFileHandle, " %f", pscores[i]);
00150     }     
00151     fprintf(pFileHandle, "\n");
00152   }
00153   // Dump just the winning language
00154   // The line format is: input_file_name winning_language score
00155   else
00156   {
00157     float score;
00158     char *pwinning_lang = plid->GetBestModel(&score);    
00159     if(!pwinning_lang)
00160       return false;
00161     if (plid->GetTestLength() < LID_MIN_LEN_TO_PROCESS)
00162       fprintf(pFileHandle, "%s %s -inf", pInputName, LID_BELOW_MIN_LEN_TEXT);
00163     else
00164       fprintf(pFileHandle, "%s %s %f", pInputName, pwinning_lang, score);
00165     fprintf(pFileHandle, "\n");
00166   }
00167   return true;
00168 }
00169 
// Returns true when pPath can be opened as a directory with opendir(),
// false otherwise. The directory handle is closed again before returning.
bool DirectoryExists(const char *pPath)
{
  if (DIR *handle = opendir(pPath))
  {
    closedir(handle);
    return true;
  }
  return false;
}
00179 
00180 int main(int argc, char *argv[])
00181 {
00182   // initial configuration variables
00183   char *pconfig_file   = 0;
00184   char *pmodel_dir     = 0;
00185   char *pinput_file    = 0;
00186   char *plist_file     = 0;
00187   char *pinput_dir     = 0;
00188   char *pwave_fmt      = "lin16";
00189   char *pwave_ext      = "raw";
00190   char *poutput_file   = 0;
00191   char *plang_name     = 0;
00192   char *pactive_langs  = 0;
00193   int  nchannels       = 1;  
00194   bool training_mode   = false;
00195   bool dump_all_scores = false;
00196   bool add_ubm_score   = false;
00197   float acwf_start     = 0;
00198   float acwf_len       = 0;
00199 
00200   // command line parsing
00201   if(argc == 1)
00202   {
00203     help();
00204     return 0;
00205   }
00206 
00207   optind = 0;
00208   while (1)
00209   {
00210     int c = getopt(argc, argv, const_cast<char *>("-c:m:i:l:d:e:n:w:s:a:o:trvg:up:"));
00211     if(c == -1)
00212       break;
00213 
00214     switch(c)
00215     {
00216       case 'c':
00217         pconfig_file = optarg;
00218         break;
00219       case 'm':
00220         pmodel_dir = optarg;
00221         break;
00222       case 'i':
00223         pinput_file = optarg;
00224         break;
00225       case 'l':
00226         plist_file = optarg;
00227         break;
00228       case 'd':
00229         pinput_dir = optarg;
00230         break;
00231       case 'w':
00232         pwave_fmt = optarg;
00233         break;
00234       case 'e':
00235         pwave_ext = optarg;
00236         break;
00237       case 'n':
00238         if(sscanf(optarg, "%d", &nchannels) != 1 || nchannels < 1)
00239         {
00240           fprintf(stderr, "ERROR: Invalid number of channels: %s.\n", optarg);
00241           return 1;
00242         }
00243         break;
00244       case 's':
00245         poutput_file = optarg;
00246         break;
00247       case 'v':
00248         gErrorHandler.SetVerbose(true);
00249         break;
00250       case 't':
00251         training_mode = true;
00252         break;     
00253       case 'r':
00254         dump_all_scores = true;
00255         break;
00256       case 'u':
00257         add_ubm_score = true;
00258         break;
00259       case 'g':
00260         plang_name = optarg;
00261         break;
00262       case 'a':
00263         pactive_langs = optarg;
00264         break;
00265       case 'p':
00266         if(sscanf(optarg, "%f,%f", &acwf_start, &acwf_len) != 2)
00267         {
00268           fprintf(stderr, "ERROR: Can not parse active waveform part '%s'\n", optarg);
00269           return 1;
00270         }
00271         break;
00272       case '?':
00273         fprintf(stderr, "ERROR: Command line parsing error.\n");
00274         return 1;
00275     }
00276   }
00277 
00278   // register license file for SLIDI
00279   SLicenseManagerI *plicman = BSAPIGetLicenseManager();
00280   if (plicman)
00281   {
00282     plicman->SetErrorHandler(&gErrorHandler);
00283     plicman->RegisterLicenseFile("license.dat");
00284   }
00285   
00286   // ask for an instance of language identification system
00287   SLIDI *plid = static_cast<SLIDI *>(BSAPICreateInstance(SIID_LID));
00288   if(!plid)
00289   {
00290     return 1;
00291   }
00292 
00293   // tell the instance where to send error messages
00294   plid->SetErrorHandler(&gErrorHandler);
00295   
00296   // read configuration file and configure the instance
00297   char pdefault_cfg[1024];
00298   sprintf(pdefault_cfg, "settings%ssigmodelling", DIRSEP);  // Visual C does not have snprintf
00299   if(!plid->Init((pconfig_file ? pconfig_file : pdefault_cfg)))
00300   {
00301     plid->Release();
00302     return 1;
00303   }
00304   
00305   // It is also possible to configure the waveform source.
00306   // If there are multiple channels, the channels are concatenated and just one decision made.
00307   SWaveformFormatConvertorI *pwc = plid->GetWaveformFormatConvertor();
00308   if (pwc)
00309   {
00310     pwc->SetNChannels(nchannels);
00311     pwc->SetInputFormatStr(pwave_fmt);
00312   }
00313 
00314   // Train language models. 
00315   // There are more possibilities of input: memory block, file, list of files, directory.
00316   // The training mode is enabled when one of AddXXX functions is entred first time.
00317   // There is possibility to train multiple languages together. This could be requested in future for discriminative training.
00318   if(training_mode)
00319   {
00320     // Set a directory saving models of languages.
00321     // If it is not set, a default one (pre-set in config file) is used.
00322     if(pmodel_dir && !plid->SetModelDirectory(pmodel_dir))
00323     {
00324       plid->Release();
00325       return 1;
00326     }
00327 
00328     // verify that we have the language name
00329     if((pinput_file || pinput_dir) && !plang_name)
00330     {
00331       fprintf(stderr, "ERROR: Training using one file or directory without knowing language name. Please set -g language\n");
00332       plid->Release();
00333       return 1;
00334     }
00335 
00336     // Get number of requested training iterations. This will return value one very likly, but the iterative
00337     // training procedure was chosen for the API to allow more difficult (discriminative) training procedures.
00338     // in future.
00339     int nreq_iters = plid->GetNRequestedTrainingIters();
00340 
00341     int i;
00342     for(i = 0; i < nreq_iters; i++)
00343     {
00344 
00345       // it is not necessary to call StartTrainingIteration if all the signal modelling subsystems 
00346       // are known to request just one iteration
00347       if(!plid->StartTrainingIteration())
00348       {
00349         plid->Release();
00350         return 1;
00351       }
00352 
00353       // the input is one file
00354       if(pinput_file && !plid->AddFile(plang_name, pinput_file))
00355       {
00356         plid->Release();
00357         return 1;
00358       }
00359       
00360       // The input is listfile. Each line of the file has two columns - language name and waveform file.
00361       // The list can be also in memory (AddFilesFromMemList).
00362       if(plist_file && !plid->AddFilesFromListFile(plist_file))
00363       {
00364         plid->Release();
00365         return 1;
00366       }
00367 
00368       // the input is directory
00369       if(pinput_dir && !plid->AddFilesFromDirectory(plang_name, pinput_dir, pwave_ext))
00370       {
00371         plid->Release();
00372         return 1;
00373       }
00374     }
00375 
00376     // It is enough to call FinishTraining when all speech files/segments are added.
00377     // The system will estimate new models. The models are ready to use in the current instance
00378     // and also saved to the actual model directory for next use.
00379     if(!plid->FinishTraining())
00380     {
00381       plid->Release();
00382       return 1;
00383     }
00384 
00385     if (plang_name && (plid->GetTrainingLength(plang_name) < LID_MIN_LEN_TO_PROCESS))
00386     {
00387       fprintf(stderr, "WARNING: Training record(s) contain only %.3f seconds of speech. "
00388         "At least %f seconds are needed to obtain significant results.\n",
00389         plid->GetTrainingLength(plang_name), LID_MIN_LEN_TO_PROCESS);
00390     }
00391   }
00392   else
00393   {
00394     // Scoring files.
00395 
00396     if (pmodel_dir && !DirectoryExists(pmodel_dir))
00397     {
00398       fprintf(stderr, "ERROR: Model directory '%s' does not exist.\n", pmodel_dir);
00399       plid->Release();
00400       return 1;
00401     }
00402 
00403     // Set a directory from which language models will be loaded.
00404     // If it is not set, a default one (pre-set in config file) is used.
00405     if(pmodel_dir && !plid->SetModelDirectory(pmodel_dir))
00406     {
00407       plid->Release();
00408       return 1;
00409     }
00410 
00411     // Set active waveform part. This can significantlu speed up identification if the records are long.
00412     if(!plid->SetActiveWaveformPart(acwf_start, acwf_len))
00413     {
00414       plid->Release();
00415       return 1;
00416     }
00417 
00418     // It is necesary to activate some or all models before scoring.
00419     if(pactive_langs)
00420       plid->ActivateModels(pactive_langs);
00421     else
00422       plid->ActivateAllModels();
00423 
00424     // Just one speech file or memory block can be scored. But the library have another possibilities to simplify 
00425     // processing of listfiles and directories
00426 
00427     // open output file
00428     FILE *pf_out = stdout;
00429     if(poutput_file)
00430     {
00431       pf_out = fopen(poutput_file, "w");
00432       if(!pf_out)
00433       {
00434         fprintf(stderr, "ERROR: Can not open output score file '%s'.", poutput_file);
00435         plid->Release();
00436         return 1;
00437       }
00438     }
00439 
00440     // the input is one file
00441     if(pinput_file)
00442     {
00443       gErrorHandler.LogMessage("Processing file: %s", pinput_file);
00444       if(!plid->TestFile(pinput_file))
00445       {
00446         plid->Release();
00447         return 1;
00448       }
00449       DumpScore(pinput_file, plid, dump_all_scores, add_ubm_score, pf_out);
00450     }
00451 
00452     // the input is listfile
00453     if(plist_file)
00454     {
00455       SFileListI *plist = static_cast<SFileListI *>(BSAPICreateInstance(SIID_FILELIST));
00456       if(!plist)
00457       {
00458         fprintf(stderr, "Memory allocation error.");
00459         plid->Release();
00460         return 1;
00461       }
00462       plist->SetErrorHandler(&gErrorHandler);
00463       if(!plist->AddList(plist_file))
00464       {
00465         plist->Release();
00466         plid->Release();
00467         return 1;
00468       }
00469 
00470       plist->FirstLine();
00471       char ptarget[1024];
00472       char psource[1024];
00473       int start;
00474       int end;
00475       float prob;
00476       while(plist->GetLine(ptarget, psource, &start, &end, &prob))
00477       {
00478         gErrorHandler.LogMessage("Processing file: %s", psource);
00479         if(!plid->TestFile(psource))
00480         {
00481           plist->Release();
00482           plid->Release();
00483           return 1;
00484         }
00485         DumpScore(psource, plid, dump_all_scores, add_ubm_score, pf_out);
00486       }
00487       plist->Release();
00488     }
00489 
00490     // the input is a directory
00491     // The FileSniffer can monitor a directory for incoming files. Here, the dictionary is searched just once.
00492     if(pinput_dir)
00493     {
00494       SFileSnifferI *psniffer = static_cast<SFileSnifferI *>(BSAPICreateInstance(SIID_FILESNIFFER));
00495       if(!psniffer)
00496       {
00497         fprintf(stderr, "Memory allocation error.");
00498         plid->Release();
00499         return 1;
00500       }
00501       psniffer->SetErrorHandler(&gErrorHandler);
00502       psniffer->AddDirectory(pinput_dir);
00503       psniffer->AddWantedSuffix(pwave_ext);
00504 
00505       if(!psniffer->FirstFile())
00506       {
00507         psniffer->Release();
00508         plid->Release();
00509         return 1;
00510       }
00511       
00512       char psource[1024];
00513       while(psniffer->GetFile(psource, sizeof(psource) - 1))
00514       {
00515         gErrorHandler.LogMessage("Processing file: %s", psource);
00516         if(!plid->TestFile(psource))
00517         {
00518           psniffer->Release();
00519           plid->Release();
00520           return 1;
00521         }
00522         DumpScore(psource, plid, dump_all_scores, add_ubm_score, pf_out);
00523       }
00524       psniffer->Release();
00525     }
00526 
00527     // close output file
00528     if(pf_out != stdout)
00529       fclose(pf_out);
00530   }
00531 
00532   // release the LID instance
00533   plid->Release();
00534 
00535   return 0;
00536 }

Generated on Wed Jul 15 10:10:04 2009 for BSAPI by  doxygen 1.4.7