vpextract.cpp

This example shows how to use the SVoicePrintExtractorI interface for speaker identification.

00001 /*****************************************************************
00002  *  BSAPI Voice-print Extractor Example                          *
00003  *                                                               *
00004  *  Author      : Petr Schwarz, Pavel Matejka, Tomas Cipr        *
00005  *  Copyright   : (C) 2006-2012 by Phonexia s.r.o                *
00006  *                                                               *
00007  *  For more info, please contact us at support@phonexia.com     *
00008  *****************************************************************/
00009 
00010 #include <stdio.h>
00011 #include <stdlib.h>
00012 #include <stdarg.h>
00013 #include <string.h>
00014 #include <assert.h>
00015 #include <errno.h>
00016 #include <math.h>
00017 
00018 #include "bsapi.h"
00019 #include "getopt.h"
00020 
00021 #ifdef WIN32
00022   #define DIRSEP "\\"
00023 #else
00024   #define DIRSEP "/"
00025 #endif
00026 
00027 // default option values
00028 #define SID_DEF_IN_DATA_TYPE      "waveform"
00029 #define SID_DEF_WAVE_FMT          "lin16"
00030 #define SID_DEF_WAVE_EXT          "raw"
00031 #define SID_DEF_FEA_FMT           "htk"
00032 #define SID_DEF_NCHANNELS         1
00033 #define SID_DEF_VPRINT_EXT        "vp"
00034 
00035 // This is an error handling object. If an error occurs, a message is sent to this object at first.
00036 // Then the running function will exit with the false or 0 return value. The object also accepts
00037 // warning and logging messages.
00038 class ErrorHandler : public SErrorCallbackI 
00039 {
00040   public:
00041     ErrorHandler() : mVerbose(false) {;}
00042     virtual void BSAPI_METHOD OnTextMessage(SUnknownI *pSender, message_type type, unsigned int messageId, const char *pMessage)
00043     {
00044       unsigned int iid = pSender ? pSender->GetIID() : SIID_UNDEFINED;
00045       switch(type)
00046       {
00047         case mtError:
00048           fprintf(stderr, "ERROR: %s - %s\n", BSAPIInterfaceId2Text(iid), pMessage);
00049           break;
00050         case mtWarning:
00051           fprintf(stderr, "WARNING: %s - %s\n", BSAPIInterfaceId2Text(iid), pMessage);
00052           break;
00053         case mtLog:
00054           LogMessage(pMessage);
00055           break;
00056       }
00057     }
00058 
00059     void LogMessage(const char *pMessage, ...)
00060     {
00061       if (mVerbose)
00062       {
00063         va_list ap;
00064         va_start(ap, pMessage);
00065         vfprintf(stderr, pMessage, ap);
00066         fprintf(stderr, "\n");
00067         va_end(ap);
00068       }
00069     }
00070 
00071     void SetVerbose(bool verbose) {mVerbose = verbose;}
00072 
00073   protected:
00074     bool mVerbose;
00075 } gErrorHandler;
00076 
00077 // Prints the usage screen (title, BSAPI version and the list of all options) to stdout.
00078 void help()
00079 {
00080   printf("\n Voice-print extractor                                          \n %s\n", BSAPIVersion());
00081   fputs(" ================================================================ \n"
00082         "                                                                  \n"
00083         " USAGE: vpextract [options]                                       \n"
00084         "                                                                  \n"
00085         " system configuration:                                            \n"
00086         "   -c file           configuration file                           \n"
00087         "   -v                verbose mode                                 \n"
00088         "                                                                  \n"
00089         " input:                                                           \n"
00090         "   -i file           input file                                   \n"
00091         "   -l file           list of input files                          \n"
00092         "   -d dir            input directory                              \n"
00093         "   -e str [raw]      extension of input files                     \n"
00094         "   -s str [waveform] input data type (waveform, features)         \n"
00095         "   -k file           use calibration file                         \n"
00096         "                                                                  \n"
00097         " input data type = waveform:                                      \n"
00098         "   -w fmt [lin16]    waveform format (lin16, lin8, alaw, mulaw)   \n"
00099         "   -n num [1]        number of channels in audio files            \n"
00100         "   -p start,len      active waveform part (in seconds)            \n"
00101         "                                                                  \n"
00102         " input data type = features:                                      \n"
00103         "   -f fmt [htk]      feature format (ascii, binary, htk)          \n"
00104         "                                                                  \n"
00105         " output:                                                          \n"
00106         "   -o file           output file                                  \n"
00107         "   -h dir            output directory                             \n"
00108         "   -x str [vp]       extension of voice-print files               \n"
00109         "   -y file           make calibration file instead of voice-prints\n"
00110         "                                                                  \n"
00111         " diarization options:                                             \n"
00112         "   -t num            total number of speakers                     \n"
00113         "   -m num [6]        maximal number of speakers                   \n"
00114         "   -a num [0.75]     constant sets the number of speakers         \n"
00115         "                                                                  \n", stdout);
00116 }
00117 
00118 int main(int argc, char *argv[])
00119 {
00120   // initial configuration variables
00121   const char *pconfig_file      = "settings/extract.bs";
00122   const char *pinput_file       = 0;
00123   const char *plist_file        = 0;
00124   const char *pinput_dir        = 0;
00125   const char *pin_data_type     = SID_DEF_IN_DATA_TYPE;
00126   const char *pwave_fmt         = SID_DEF_WAVE_FMT;
00127   const char *pwave_ext         = SID_DEF_WAVE_EXT;
00128   const char *pfea_fmt          = SID_DEF_FEA_FMT;
00129   const char *poutput_file      = 0;
00130   const char *poutput_dir       = 0;
00131   const char *pvprint_ext       = SID_DEF_VPRINT_EXT;
00132   const char *pin_calib_file    = 0;
00133   const char *pout_calib_file   = 0;
00134   int nchannels                 = SID_DEF_NCHANNELS;
00135   int total_speakers            = 0;
00136   int max_speakers              = 0;
00137   float acwf_start              = 0.0f;
00138   float acwf_len                = 0.0f;
00139   float max_avg_distance        = 0.0f;
00140 
00141   // command line parsing
00142   if(argc == 1)
00143   {
00144     help();
00145     return 0;
00146   }
00147 
00148   optind = 0;
00149   while (1)
00150   {
00151     int c = getopt(argc, argv, const_cast<char *>("c:i:l:d:s:e:w:n:p:f:o:h:x:k:y:t:m:a:v"));
00152     if(c == -1)
00153       break;
00154 
00155     switch(c)
00156     {
00157       case 'c':
00158         pconfig_file = optarg;
00159         break;
00160 
00161       case 'i':
00162         pinput_file = optarg;
00163         break;
00164       case 'l':
00165         plist_file = optarg;
00166         break;
00167       case 'd':
00168         pinput_dir = optarg;
00169         break;
00170       case 's':
00171         pin_data_type = optarg;
00172         break;
00173       case 'w':
00174         pwave_fmt = optarg;
00175         break;
00176       case 'e':
00177         pwave_ext = optarg;
00178         break;
00179       case 'n':
00180         if(sscanf(optarg, "%d", &nchannels) != 1 || nchannels < 1)
00181         {
00182           fprintf(stderr, "ERROR: Invalid number of channels: %s.\n", optarg);
00183           return 1;
00184         }
00185         break;
00186       case 'p':
00187         if(sscanf(optarg, "%f,%f", &acwf_start, &acwf_len) != 2)
00188         {
00189           fprintf(stderr, "ERROR: Can not parse active waveform part '%s'\n", optarg);
00190           return 1;
00191         }
00192         break;
00193       case 'f':
00194         pfea_fmt = optarg;
00195         break;
00196       case 'o':
00197         poutput_file = optarg;
00198         break;
00199       case 'h':
00200         poutput_dir = optarg;
00201         break;
00202       case 'v':
00203         gErrorHandler.SetVerbose(true);
00204         break;
00205       case 'x':
00206         pvprint_ext = optarg;
00207         break;
00208       case 'k':
00209         pin_calib_file = optarg;
00210         break;
00211       case 'y':
00212         pout_calib_file = optarg;
00213         break;
00214       case 't':
00215         if(sscanf(optarg, "%d", &total_speakers) != 1 || total_speakers < 0)
00216         {
00217           fprintf(stderr, "ERROR: Invalid total number of speakers '%s'. "
00218                           "Must be positive integer or 0.\n", optarg);
00219           return 1;
00220         }
00221         break;
00222       case 'm':
00223         if(sscanf(optarg, "%d", &max_speakers) != 1 || max_speakers < 0)
00224         {
00225           fprintf(stderr, "ERROR: Invalid maximal number of speakers '%s'. "
00226                           "Must be positive integer or 0.\n", optarg);
00227           return 1;
00228         }
00229         break;
00230       case 'a':
00231         if(sscanf(optarg, "%f", &max_avg_distance) != 1 || max_avg_distance < 0)
00232         {
00233           fprintf(stderr, "ERROR: Invalid constant sets the number of speakers  '%s'. "
00234                           "Must be positive floating point number or 0.\n", optarg);
00235           return 1;
00236         }
00237         break;
00238       case '?':
00239         fprintf(stderr, "ERROR: Command line parsing error.\n");
00240         return 1;
00241       default :
00242         fprintf(stderr, "ERROR: Command line parsing error. Unexpected argument '%s'.\n", optarg);
00243         return 1;
00244     }
00245   }
00246 
00247   // register license file for SSpeakerIDI
00248   SLicenseManagerI *plicman = BSAPIGetLicenseManager();
00249   if (plicman)
00250   {
00251     plicman->SetErrorHandler(&gErrorHandler);
00252     plicman->RegisterLicenseFile("license.dat");
00253   }
00254 
00255   // ask for an instance of voice-print extractor
00256   SVoicePrintExtractorI *pvp_extract = static_cast<SVoicePrintExtractorI *>(BSAPICreateInstance(SIID_VPRINTEXTRACTOR));
00257   if(!pvp_extract)
00258   {
00259     return 1;
00260   }
00261 
00262   // tell the instance where to send error messages
00263   pvp_extract->SetErrorHandler(&gErrorHandler);
00264 
00265   // read configuration file and configure the instance
00266   char pdefault_cfg[1024];
00267   sprintf(pdefault_cfg, "settings%sconfig", DIRSEP);  // Visual C does not have snprintf
00268   if(!pvp_extract->Init((pconfig_file ? pconfig_file : pdefault_cfg)))
00269   {
00270     pvp_extract->Release();
00271     return 1;
00272   }
00273 
00274   SBlockSetI *pbset = pvp_extract->GetBlockSet();
00275   if(!pbset)
00276   {
00277     pvp_extract->Release();
00278     return 1;
00279   }
00280 
00281   SWaveformFormatConvertorI *pwc = static_cast<SWaveformFormatConvertorI *>(pbset->GetBlock("waveform_convertor"));
00282   if(!pwc)
00283   {
00284     pvp_extract->Release();
00285     return 1;
00286   }
00287 
00288   pwc->SetNChannels(nchannels);
00289   if (!pwc->SetInputFormatStr(pwave_fmt))
00290   {
00291     pvp_extract->Release();
00292     return 1;
00293   }
00294 
00295   if(pbset->Exists("feature_source"))
00296   {
00297     SFeatureSourceI *pfeature_source = static_cast<SFeatureSourceI *>(pbset->GetBlock("feature_source"));
00298     if(!pfeature_source)
00299     {
00300       pvp_extract->Release();
00301       return 1;
00302     }
00303 
00304     if (!pfeature_source->SetFileFormatStr(pfea_fmt))
00305     {
00306       pvp_extract->Release();
00307       return 1;
00308     }
00309   }
00310 
00311   // set active waveform part, this can significantly speed up identification if the records are long
00312   if(acwf_len > 0.0f)
00313   {
00314     SActiveWaveformPartSelectorI *pacwf_sel = static_cast<SActiveWaveformPartSelectorI *>(pbset->GetBlock("param.waveform_part_selector"));
00315     if(!pacwf_sel)
00316     {
00317       pvp_extract->Release();
00318       return 1;
00319     }
00320 
00321     pacwf_sel->SetSelectionStartSec(acwf_start);
00322     pacwf_sel->SetSelectionLengthSec(acwf_len);
00323   }
00324 
00325   if(pbset->Exists("diar.quantizer"))
00326   {
00327     SVectorQuantizerI *pquantizer = static_cast<SVectorQuantizerI *>(pbset->GetBlock("diar.quantizer"));
00328     if(!pquantizer)
00329     {
00330       pvp_extract->Release();
00331       return 1;
00332     }
00333 
00334     if(total_speakers)   pquantizer->SetTotalClusters(total_speakers);
00335     if(max_avg_distance) pquantizer->SetMaxAvgDistance(max_avg_distance);
00336     if(max_speakers)     pquantizer->SetMaxClusters(max_speakers);
00337   }
00338   else
00339   {
00340     if(total_speakers != 0 || max_speakers != 0 || max_avg_distance != 0.0f)
00341       fprintf(stderr, "WARNING: Diarization options were not applied. Config file '%s' does not support diarization. \n", (pconfig_file ? pconfig_file : pdefault_cfg));
00342   }
00343 
00344   // if user calibration is turned on load calibration parameters and/or start estimation
00345   SUserCalibrationI *pcalib = 0;
00346   if(pin_calib_file || pout_calib_file)
00347   {
00348     pcalib = static_cast<SUserCalibrationI *>(pbset->GetBlock("user_calibration"));
00349     if(!pcalib)
00350     {
00351       pvp_extract->Release();
00352       return 1;
00353     }
00354 
00355     if(pout_calib_file)
00356     {
00357       pcalib->StartEstimation();
00358     }
00359     else // pin_calib_file
00360     {
00361       pcalib->SetEnabled(true);
00362       if(!pcalib->Load(pin_calib_file))
00363       {
00364         pvp_extract->Release();
00365         return 1;
00366       }
00367     }
00368   }
00369 
00370   if(!pvp_extract->SetInputDataTypeStr(pin_data_type))
00371   {
00372     pvp_extract->Release();
00373     return 1;
00374   }
00375 
00376   // process one file
00377   if(pinput_file && !pvp_extract->ProcessFile(pinput_file, poutput_file))
00378   {
00379     pvp_extract->Release();
00380     return 1;
00381   }
00382 
00383   // Set output directory and voice-print file suffix. In the file list, this directory is used if the output file name is not specified.
00384   pvp_extract->SetOutputDirectory(poutput_dir);
00385   pvp_extract->SetWaveformFileSuffix(pwave_ext);
00386   pvp_extract->SetVoicePrintFileSuffix(pvprint_ext);
00387 
00388 
00389   // process list of files
00390   if(plist_file && !pvp_extract->ProcessList(plist_file))
00391   {
00392     pvp_extract->Release();
00393     return 1;
00394   }
00395 
00396   if(pinput_dir && !pvp_extract->ProcessDirectory(pinput_dir))
00397   {
00398     pvp_extract->Release();
00399     return 1;
00400   }
00401 
00402   // end calibration estimation and save calibration parameters
00403   if(pout_calib_file)
00404   {
00405     assert(pcalib);
00406     pcalib->EndEstimation();
00407     if (!pcalib->Save(pout_calib_file))
00408     {
00409       pvp_extract->Release();
00410       return 1;
00411     }
00412   }
00413 
00414   pvp_extract->Release();
00415   return 0;
00416 }

Generated on Wed Apr 11 10:00:17 2012 for BSAPI by  doxygen 1.4.7