speaker_identification.cpp

This example shows how to use the SSpeakerIDI interface.

00001 /*****************************************************************
00002  *  BSAPI Speaker Identification Example                        *
00003  *                                                               *
00004  *  Copyright   : (C) 2006-2008 by Petr Schwarz & Pavel Matejka  *
00005  *                         & Tomas Cipr                          *
00006  *                                                               *
00007  *  Email       : {schwarz,matejka,cipr}@phonexia.com            *
00008  ****************************************************************/
00009 
00010 #include <stdio.h>
00011 #include <stdlib.h>
00012 #include <stdarg.h>
00013 #include <string.h>
00014 #include <errno.h>
00015 #include <math.h>
00016 
00017 #include "bsapi.h"
00018 #include "getopt.h"
00019 
00020 #ifdef WIN32
00021   #define DIRSEP "\\"
00022 #else
00023   #define DIRSEP "/"
00024 #endif
00025 
00026 // Compile-time program options.
00027 #define SID_MIN_LEN_TO_PROCESS    10.0f       // minimum speech length in seconds; shorter inputs are reported as too short instead of scored
00028 #define SID_BELOW_MIN_LEN_TEXT    "(too short)"   // marker appended to output lines of inputs below the minimum length
00029 #define SID_OUT_COLUMN_CHARS      "snlr"          // characters accepted by -f (raw score, normalized score, speech length, record length)
00030 
00031 // Default option values (used when the matching command line option is not given).
00032 #define SID_DEF_WAVE_FMT          "alaw"          // -w waveform format
00033 #define SID_DEF_WAVE_EXT          "alw"           // -e audio file extension
00034 #define SID_DEF_NCHANNELS         1               // -n number of channels
00035 #define SID_DEF_SCORE_SHARPNESS   0.3f            // -z scale applied to the score inside RescaleScore
00036 
00037 // This is an error handling object. If an error occur, a message is sent to this object at first. 
00038 // Then the running function will exit with the false or 0 return value. The object also accept 
00039 // warning and logging messages.
00040 class ErrorHandler : public SErrorCallbackI 
00041 {
00042   public:
00043     ErrorHandler() : mVerbose(false) {;}
00044     virtual void BSAPI_METHOD OnTextMessage(unsigned int iId, message_type type, unsigned int messageId, char *pMessage)
00045     {
00046       switch(type)
00047       {
00048         case mtError:
00049           fprintf(stderr, "ERROR: %s - %s\n", BSAPIInterfaceId2Text(iId), pMessage);
00050           break;
00051         case mtWarning:
00052           fprintf(stderr, "WARNING: %s - %s\n", BSAPIInterfaceId2Text(iId), pMessage);
00053           break;
00054         case mtLog:
00055           LogMessage(pMessage);
00056           break;
00057       }
00058     }
00059     
00060     void LogMessage(const char *pMessage, ...)
00061     {
00062       if (mVerbose)
00063       {
00064         va_list ap;
00065         va_start(ap, pMessage);
00066         vfprintf(stderr, pMessage, ap);
00067         fprintf(stderr, "\n");
00068         va_end(ap);
00069       }
00070     }
00071     
00072     void SetVerbose(bool verbose) {mVerbose = verbose;}
00073   
00074   protected:
00075     bool mVerbose;
00076 } gErrorHandler;
00077 
00078 void help()
00079 { 
00080   puts("\n Speaker identification                                         ");
00081   printf(" %s\n", BSAPIVersion());
00082   puts(" ================================================================ ");
00083   puts("                                                                  ");
00084   puts(" USAGE: sid [options]                                             ");
00085   puts("                                                                  ");
00086   puts(" system configuration:                                            ");
00087   puts("   -c file          configuration file                            ");
00088   puts("   -m dir           model directory                               ");
00089   puts("   -a str1,str2...  active speaker models                         ");
00090   puts("   -v               verbose mode                                  ");
00091   puts("                                                                  ");
00092   puts(" input:                                                           ");
00093   puts("   -i file          input file                                    ");
00094   puts("   -l file          list of input files                           ");
00095   puts("   -d dir           input directory                               ");
00096   puts("   -e str [alw]     extension of audio files                      ");
00097   puts("   -w fmt [alaw]    waveform format (lin16, lin8, alaw, mulaw)    ");
00098   puts("   -n num [1]       number of channels in audio files             ");
00099   puts("   -p start,len     active waveform part (in seconds)             ");
00100   puts("                                                                  ");
00101   puts(" training:                                                        ");
00102   puts("   -t               enable training                               ");
00103   puts("   -g str           name of the speaker to be trained             ");
00104   puts("                                                                  ");
00105   puts(" output:                                                          ");
00106   puts("   -s file          output score file                             ");
00107   puts("   -r               produce scores for all speakers               "); 
00108   puts("   -f columns       enable column output format (see columns)     ");
00109   puts("   -z num [0.3]     score sharpness (positive number)             ");
00110   puts("                                                                  ");
00111   puts(" columns:           (columns to print are specified by string of  ");
00112   puts("                     the characters below, e.g. lsn)              ");
00113   puts("    s               raw score                                     ");
00114   puts("    n               score normalized to <0, 100>                  ");
00115   puts("    l               speech length                                 ");
00116   puts("    r               record length                                 ");
00117   puts("                                                                  "); 
00118 }
00119 
// Maps a raw score onto the open interval (0, 100) with a logistic curve:
// 100 / (1 + exp(-scale * score)). A score of 0 yields 50; a larger scale
// makes the transition around 0 steeper.
inline float RescaleScore(float score, float scale = 1.0f)
{
  const float exponent = -(scale * score);
  const float denominator = 1.0f + expf(exponent);
  return 100.0f / denominator;
}
00124 
00125 // The line format is: input_file_name speaker [optional columns]
00126 void DumpColWise(const char *pInputName, char **ppNames, float *pScores, int nSpeakers, 
00127   float recordLength, float speechLength, float sharpness, const char *pColumnFmt, FILE *pFileHandle)
00128 {
00129   for (int i = 0; i < nSpeakers; i++)
00130   {
00131     fprintf(pFileHandle, "%s %s", pInputName, ppNames[i]);
00132     for (const char *pc = pColumnFmt; *pc != '\0'; pc++)
00133     {
00134       switch (*pc)
00135       {
00136         case 's':
00137           if (speechLength < SID_MIN_LEN_TO_PROCESS)
00138             fprintf(pFileHandle, " -inf");
00139           else
00140             fprintf(pFileHandle, " %.3f", pScores[i]);
00141           break;
00142           
00143         case 'n':
00144           if (speechLength < SID_MIN_LEN_TO_PROCESS)
00145             fprintf(pFileHandle, " 0.000");
00146           else
00147             fprintf(pFileHandle, " %.3f", RescaleScore(pScores[i], sharpness));
00148           break;
00149           
00150         case 'l':
00151           fprintf(pFileHandle, " %.3f", speechLength);
00152           break;
00153           
00154         case 'r':
00155           fprintf(pFileHandle, " %.3f", recordLength);
00156           break;
00157       }
00158     }
00159     
00160     if (speechLength < SID_MIN_LEN_TO_PROCESS)
00161       fprintf(pFileHandle, " %s", SID_BELOW_MIN_LEN_TEXT);
00162     
00163     fprintf(pFileHandle, "\n");
00164   }
00165 }
00166 
00167 // The line format is: input_file_name score1 score2 ... scoreN
00168 // In addition, the first line will contain names of all speakers
00169 void DumpRowWise(const char *pInputName, char **ppNames, float *pScores, int nSpeakers, 
00170   float speechLength, float sharpness, FILE *pFileHandle)
00171 {
00172   static bool first_time = true;
00173   
00174   if (first_time)
00175   {
00176     if (nSpeakers > 0)
00177       fprintf(pFileHandle, "%s", ppNames[0]);
00178     for (int i = 1; i < nSpeakers; i++)
00179       fprintf(pFileHandle, " %s", ppNames[i]);
00180     fprintf(pFileHandle, "\n");
00181     first_time = false;
00182   }
00183 
00184   fprintf(pFileHandle, "%s", pInputName);
00185   for (int i = 0; i < nSpeakers; i++)
00186   {
00187     if (speechLength < SID_MIN_LEN_TO_PROCESS)
00188       fprintf(pFileHandle, " 0.000");
00189     else
00190       fprintf(pFileHandle, " %.3f", RescaleScore(pScores[i], sharpness));
00191   }
00192   
00193   if (speechLength < SID_MIN_LEN_TO_PROCESS)
00194     fprintf(pFileHandle, " %s", SID_BELOW_MIN_LEN_TEXT);
00195     
00196   fprintf(pFileHandle, "\n");
00197 }
00198 
00199 // This function outputs the winning speaker or scores for all of them
00200 bool DumpScore(const char *pInputName, SSpeakerIDI *psid, bool dumpAllScores, 
00201   const char *pColumnFmt, float sharpness, FILE *pFileHandle)
00202 {
00203   SWaveformFormatConvertorI *pwfconv = psid->GetWaveformFormatConvertor();
00204   if (!pwfconv)
00205     return false;
00206  
00207   float speech_length = psid->GetTestLength();
00208   float record_length = pwfconv->GetInputLength(); 
00209   
00210   if (dumpAllScores)
00211   {
00212     int num = 0;
00213     char **ppnames = psid->GetModelNames(&num);
00214     float *pscores = psid->GetModelScores(&num);
00215     
00216     // the functions above can return 0 in case of an error
00217     if (!ppnames || !pscores) 
00218       return false;
00219     
00220     // 0th array element is for UBM, skip it
00221     if (pColumnFmt)
00222     {
00223       DumpColWise(pInputName, ppnames + 1, pscores + 1, num - 1, record_length, 
00224         speech_length, sharpness, pColumnFmt, pFileHandle);
00225     }
00226     else
00227     {
00228       DumpRowWise(pInputName, ppnames + 1, pscores + 1, num - 1, speech_length, 
00229         sharpness, pFileHandle);
00230     }
00231   }
00232   else
00233   {
00234     float score = 0.0f;
00235     char *pname = psid->GetBestModel(&score);
00236     
00237     if (!pname) 
00238       return false;
00239     
00240     DumpColWise(pInputName, &pname, &score, 1, record_length, 
00241       speech_length, sharpness, (pColumnFmt ? pColumnFmt : "n"), pFileHandle);
00242   }
00243   
00244   return true;
00245 }
00246 
00247 int main(int argc, char *argv[])
00248 {
00249   // initial configuration variables
00250   char *pconfig_file   = 0;
00251   char *pmodel_dir     = 0;
00252   char *pinput_file    = 0;
00253   char *plist_file     = 0;
00254   char *pinput_dir     = 0;
00255   char *pwave_fmt      = SID_DEF_WAVE_FMT;
00256   char *pwave_ext      = SID_DEF_WAVE_EXT;
00257   char *poutput_file   = 0;
00258   char *pcolumn_fmt    = 0;
00259   char *pspeaker_name  = 0;
00260   char *pactive_speakers = 0;
00261   int  nchannels       = SID_DEF_NCHANNELS;  
00262   bool training_mode   = false;
00263   bool dump_all_scores = false;
00264   float sharpness      = SID_DEF_SCORE_SHARPNESS;
00265   float acwf_start     = 0;
00266   float acwf_len       = 0;
00267 
00268   // command line parsing
00269   if(argc == 1)
00270   {
00271     help();
00272     return 0;
00273   }
00274 
00275   optind = 0;
00276   while (1)
00277   {
00278     int c = getopt(argc, argv, const_cast<char *>("c:m:i:l:d:e:n:w:s:a:o:trf:vg:p:z:"));
00279     if(c == -1)
00280       break;
00281 
00282     switch(c)
00283     {
00284       case 'c':
00285         pconfig_file = optarg;
00286         break;
00287       case 'm':
00288         pmodel_dir = optarg;
00289         break;
00290       case 'i':
00291         pinput_file = optarg;
00292         break;
00293       case 'l':
00294         plist_file = optarg;
00295         break;
00296       case 'd':
00297         pinput_dir = optarg;
00298         break;
00299       case 'w':
00300         pwave_fmt = optarg;
00301         break;
00302       case 'e':
00303         pwave_ext = optarg;
00304         break;
00305       case 'n':
00306         if(sscanf(optarg, "%d", &nchannels) != 1 || nchannels < 1)
00307         {
00308           fprintf(stderr, "ERROR: Invalid number of channels: %s.\n", optarg);
00309           return 1;
00310         }
00311         break;
00312       case 's':
00313         poutput_file = optarg;
00314         break;
00315       case 'v':
00316         gErrorHandler.SetVerbose(true);
00317         break;
00318       case 't':
00319         training_mode = true;
00320         break;     
00321       case 'r':
00322         dump_all_scores = true;
00323         break;
00324       case 'f':
00325         pcolumn_fmt = optarg;
00326         if (strspn(pcolumn_fmt, SID_OUT_COLUMN_CHARS) != strlen(pcolumn_fmt))
00327         {
00328           fprintf(stderr, "ERROR: Wrong format of output columns string. "
00329           "The set of allowed characters is '%s'.\n", SID_OUT_COLUMN_CHARS);
00330           return 1;
00331         }
00332         break;
00333       case 'z':
00334         if(sscanf(optarg, "%f", &sharpness) != 1)
00335         {
00336           fprintf(stderr, "ERROR: Wrong value of score sharpness '%s'. Must be positive number.\n", optarg);
00337           return 1;
00338         }
00339         break;
00340       case 'g':
00341         pspeaker_name = optarg;
00342         break;
00343       case 'a':
00344         pactive_speakers = optarg;
00345         break;
00346       case 'p':
00347         if(sscanf(optarg, "%f,%f", &acwf_start, &acwf_len) != 2)
00348         {
00349           fprintf(stderr, "ERROR: Can not parse active waveform part '%s'\n", optarg);
00350           return 1;
00351         }
00352         break;
00353       case '?':
00354         fprintf(stderr, "ERROR: Command line parsing error.\n");
00355         return 1;
00356     }
00357   }
00358 
00359   // register license file for SSpeakerIDI
00360   SLicenseManagerI *plicman = BSAPIGetLicenseManager();
00361   if (plicman)
00362   {
00363     plicman->SetErrorHandler(&gErrorHandler);
00364     plicman->RegisterLicenseFile("license.dat");
00365   }
00366 
00367   // ask for an instance of speaker identification system
00368   SSpeakerIDI *psid = static_cast<SSpeakerIDI *>(BSAPICreateInstance(SIID_SPKID));
00369   if(!psid)
00370   {
00371     return 1;
00372   }
00373 
00374   // tell the instance where to send error messages
00375   psid->SetErrorHandler(&gErrorHandler);
00376   
00377   // read configuration file and configure the instance
00378   char pdefault_cfg[1024];
00379   sprintf(pdefault_cfg, "settings%sconfig", DIRSEP);  // Visual C does not have snprintf
00380   if(!psid->Init((pconfig_file ? pconfig_file : pdefault_cfg)))
00381   {
00382     psid->Release();
00383     return 1;
00384   }
00385   
00386   // Set a directory saving models of speakers.
00387   // If it is not set, a default one (pre-set in config file) is used.
00388   if(pmodel_dir && !psid->SetModelDirectory(pmodel_dir))
00389   {
00390     psid->Release();
00391     return 1;
00392   }
00393   
00394   // It is also possible to configure the waveform source.
00395   // If there are multiple channels, the channels are concatenated and just one decision made.
00396   SWaveformFormatConvertorI *pwc = psid->GetWaveformFormatConvertor();
00397   if (pwc)
00398   {
00399     pwc->SetNChannels(nchannels);
00400     pwc->SetInputFormatStr(pwave_fmt);
00401   }
00402   
00403   // Train speaker models. 
00404   // There are more possibilities of input: memory block, file, list of files, directory.
00405   // The training mode is enabled when one of AddXXX functions is entred first time.
00406   // There is possibility to train multiple speakers together. This could be requested in future for discriminative training.
00407   if(training_mode)
00408   {
00409     // verify that we have the speaker name
00410     if((pinput_file || pinput_dir) && !pspeaker_name)
00411     {
00412       fprintf(stderr, "ERROR: Training using one file or directory without knowing speaker name. Please set -g speaker\n");
00413       psid->Release();
00414       return 1;
00415     }
00416 
00417     // Get number of requested training iterations. This will return value one very likely, but the iterative
00418     // training procedure was chosen for the API to allow more difficult (discriminative) training procedures.
00419     // in future.
00420     int nreq_iters = psid->GetNRequestedTrainingIters();
00421 
00422     int i;
00423     for(i = 0; i < nreq_iters; i++)
00424     {
00425 
00426       // it is not necessary to call StartTrainingIteration if all the signal modelling subsystems 
00427       // are known to request just one iteration
00428       if(!psid->StartTrainingIteration())
00429       {
00430         psid->Release();
00431         return 1;
00432       }
00433 
00434       // the input is one file
00435       if(pinput_file && !psid->AddFile(pspeaker_name, pinput_file))
00436       {
00437         psid->Release();
00438         return 1;
00439       }
00440       
00441       // The input is listfile. Each line of the file has two columns - speaker name and waveform file.
00442       // The list can be also in memory (AddFilesFromMemList).
00443       if(plist_file && !psid->AddFilesFromListFile(plist_file))
00444       {
00445         psid->Release();
00446         return 1;
00447       }
00448 
00449       // the input is directory
00450       if(pinput_dir && !psid->AddFilesFromDirectory(pspeaker_name, pinput_dir, pwave_ext))
00451       {
00452         psid->Release();
00453         return 1;
00454       }
00455     }
00456 
00457     // It is enough to call FinishTraining when all speech files/segments are added.
00458     // The system will estimate new models. The models are ready to use in the current instance
00459     // and also saved to the actual model directory for next use.
00460     if(!psid->FinishTraining())
00461     {
00462       psid->Release();
00463       return 1;
00464     }
00465     
00466     if (pspeaker_name && (psid->GetTrainingLength(pspeaker_name) < SID_MIN_LEN_TO_PROCESS))
00467     {
00468       fprintf(stderr, "WARNING: Training record(s) contain only %.3f seconds of speech. "
00469         "At least %f seconds are needed to obtain significant results.\n",
00470         psid->GetTrainingLength(pspeaker_name), SID_MIN_LEN_TO_PROCESS);
00471     }
00472   }
00473   else
00474   {
00475     // Scoring files.
00476 
00477     // Set active waveform part. This can significantly speed up identification if the records are long.
00478     if(!psid->SetActiveWaveformPart(acwf_start, acwf_len))
00479     {
00480       psid->Release();
00481       return 1;
00482     }
00483 
00484     // It is necesary to activate some or all models before scoring.
00485     if(pactive_speakers)
00486       psid->ActivateModels(pactive_speakers);
00487     else
00488       psid->ActivateAllModels();
00489 
00490     // Just one speech file or memory block can be scored. But the library have another possibilities to simplify 
00491     // processing of listfiles and directories
00492 
00493     // open output file
00494     FILE *pf_out = stdout;
00495     if(poutput_file)
00496     {
00497       pf_out = fopen(poutput_file, "w");
00498       if(!pf_out)
00499       {
00500         fprintf(stderr, "ERROR: Can not open output score file '%s'.", poutput_file);
00501         psid->Release();
00502         return 1;
00503       }
00504     }
00505 
00506     // the input is one file
00507     if(pinput_file)
00508     {
00509       gErrorHandler.LogMessage("Processing file: %s", pinput_file);
00510       if(!psid->TestFile(pinput_file))
00511       {
00512         psid->Release();
00513         return 1;
00514       }
00515       DumpScore(pinput_file, psid, dump_all_scores, pcolumn_fmt, sharpness, pf_out);
00516     }
00517 
00518     // the input is listfile
00519     if(plist_file)
00520     {
00521       SFileListI *plist = static_cast<SFileListI *>(BSAPICreateInstance(SIID_FILELIST));
00522       if(!plist)
00523       {
00524         fprintf(stderr, "Memory allocation error.");
00525         psid->Release();
00526         return 1;
00527       }
00528       plist->SetErrorHandler(&gErrorHandler);
00529       if(!plist->AddList(plist_file))
00530       {
00531         plist->Release();
00532         psid->Release();
00533         return 1;
00534       }
00535 
00536       plist->FirstLine();
00537       char ptarget[1024];
00538       char psource[1024];
00539       int start;
00540       int end;
00541       float prob;
00542       while(plist->GetLine(ptarget, psource, &start, &end, &prob))
00543       {
00544         gErrorHandler.LogMessage("Processing file: %s", psource);
00545         if(!psid->TestFile(psource))
00546         {
00547           plist->Release();
00548           psid->Release();
00549           return 1;
00550         }
00551         DumpScore(psource, psid, dump_all_scores, pcolumn_fmt, sharpness, pf_out);
00552       }
00553       plist->Release();
00554     }
00555 
00556     // the input is a directory
00557     // The FileSniffer can monitor a directory for incoming files. Here, the dictionary is searched just once.
00558     if(pinput_dir)
00559     {
00560       SFileSnifferI *psniffer = static_cast<SFileSnifferI *>(BSAPICreateInstance(SIID_FILESNIFFER));
00561       if(!psniffer)
00562       {
00563         fprintf(stderr, "Memory allocation error.");
00564         psid->Release();
00565         return 1;
00566       }
00567       psniffer->SetErrorHandler(&gErrorHandler);
00568       psniffer->AddDirectory(pinput_dir);
00569       psniffer->AddWantedSuffix(pwave_ext);
00570 
00571       if(!psniffer->FirstFile())
00572       {
00573         psniffer->Release();
00574         psid->Release();
00575         return 1;
00576       }
00577       
00578       char psource[1024];
00579       while(psniffer->GetFile(psource, sizeof(psource) - 1))
00580       {
00581         gErrorHandler.LogMessage("Processing file: %s", psource);
00582         if(!psid->TestFile(psource))
00583         {
00584           psniffer->Release();
00585           psid->Release();
00586           return 1;
00587         }
00588         DumpScore(psource, psid, dump_all_scores, pcolumn_fmt, sharpness, pf_out);
00589       }
00590       psniffer->Release();
00591     }
00592 
00593     // close output file
00594     if(pf_out != stdout)
00595       fclose(pf_out);
00596   }
00597 
00598   // release the SID instance
00599   psid->Release();
00600 
00601   return 0;
00602 }

Generated on Wed Jul 15 10:10:04 2009 for BSAPI by  doxygen 1.4.7