00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #include <stdio.h>
00011 #include <stdlib.h>
00012 #include <stdarg.h>
00013 #include <string.h>
00014 #include <assert.h>
00015 #include <errno.h>
00016 #include <math.h>
00017
00018 #include "bsapi.h"
00019 #include "getopt.h"
00020
/* Path component separator used when composing file-system paths. */
#if defined(WIN32)
#  define DIRSEP "\\"
#else
#  define DIRSEP "/"
#endif

/* Built-in defaults for the command line options (see help()). */
#define SID_DEF_NCHANNELS    1           /* -n : number of audio channels      */
#define SID_DEF_IN_DATA_TYPE "waveform"  /* -s : input data type               */
#define SID_DEF_WAVE_FMT     "lin16"     /* -w : waveform sample format        */
#define SID_DEF_WAVE_EXT     "raw"       /* -e : extension of input files      */
#define SID_DEF_FEA_FMT      "htk"       /* -f : feature file format           */
#define SID_DEF_VPRINT_EXT   "vp"        /* -x : extension of voice-print files */
00034
00035
00036
00037
00038 class ErrorHandler : public SErrorCallbackI
00039 {
00040 public:
00041 ErrorHandler() : mVerbose(false) {;}
00042 virtual void BSAPI_METHOD OnTextMessage(SUnknownI *pSender, message_type type, unsigned int messageId, const char *pMessage)
00043 {
00044 unsigned int iid = pSender ? pSender->GetIID() : SIID_UNDEFINED;
00045 switch(type)
00046 {
00047 case mtError:
00048 fprintf(stderr, "ERROR: %s - %s\n", BSAPIInterfaceId2Text(iid), pMessage);
00049 break;
00050 case mtWarning:
00051 fprintf(stderr, "WARNING: %s - %s\n", BSAPIInterfaceId2Text(iid), pMessage);
00052 break;
00053 case mtLog:
00054 LogMessage(pMessage);
00055 break;
00056 }
00057 }
00058
00059 void LogMessage(const char *pMessage, ...)
00060 {
00061 if (mVerbose)
00062 {
00063 va_list ap;
00064 va_start(ap, pMessage);
00065 vfprintf(stderr, pMessage, ap);
00066 fprintf(stderr, "\n");
00067 va_end(ap);
00068 }
00069 }
00070
00071 void SetVerbose(bool verbose) {mVerbose = verbose;}
00072
00073 protected:
00074 bool mVerbose;
00075 } gErrorHandler;
00076
00077 void help()
00078 {
00079 puts("\n Voice-print extractor ");
00080 printf(" %s\n", BSAPIVersion());
00081 puts(" ================================================================ ");
00082 puts(" ");
00083 puts(" USAGE: vpextract [options] ");
00084 puts(" ");
00085 puts(" system configuration: ");
00086 puts(" -c file configuration file ");
00087 puts(" -v verbose mode ");
00088 puts(" ");
00089 puts(" input: ");
00090 puts(" -i file input file ");
00091 puts(" -l file list of input files ");
00092 puts(" -d dir input directory ");
00093 puts(" -e str [raw] extension of input files ");
00094 puts(" -s str [waveform] input data type (waveform, features) ");
00095 puts(" -k file use calibration file ");
00096 puts(" ");
00097 puts(" input data type = waveform: ");
00098 puts(" -w fmt [lin16] waveform format (lin16, lin8, alaw, mulaw) ");
00099 puts(" -n num [1] number of channels in audio files ");
00100 puts(" -p start,len active waveform part (in seconds) ");
00101 puts(" ");
00102 puts(" input data type = features: ");
00103 puts(" -f fmt [htk] feature format (ascii, binary, htk) ");
00104 puts(" ");
00105 puts(" output: ");
00106 puts(" -o file output file ");
00107 puts(" -h dir output directory ");
00108 puts(" -x str [vp] extension of voice-print files ");
00109 puts(" -y file make calibration file instead of voice-prints");
00110 puts(" ");
00111 puts(" diarization options: ");
00112 puts(" -t num total number of speakers ");
00113 puts(" -m num [6] maximal number of speakers ");
00114 puts(" -a num [0.75] constant sets the number of speakers ");
00115 puts(" ");
00116 }
00117
00118 int main(int argc, char *argv[])
00119 {
00120
00121 const char *pconfig_file = "settings/extract.bs";
00122 const char *pinput_file = 0;
00123 const char *plist_file = 0;
00124 const char *pinput_dir = 0;
00125 const char *pin_data_type = SID_DEF_IN_DATA_TYPE;
00126 const char *pwave_fmt = SID_DEF_WAVE_FMT;
00127 const char *pwave_ext = SID_DEF_WAVE_EXT;
00128 const char *pfea_fmt = SID_DEF_FEA_FMT;
00129 const char *poutput_file = 0;
00130 const char *poutput_dir = 0;
00131 const char *pvprint_ext = SID_DEF_VPRINT_EXT;
00132 const char *pin_calib_file = 0;
00133 const char *pout_calib_file = 0;
00134 int nchannels = SID_DEF_NCHANNELS;
00135 int total_speakers = 0;
00136 int max_speakers = 0;
00137 float acwf_start = 0.0f;
00138 float acwf_len = 0.0f;
00139 float max_avg_distance = 0.0f;
00140
00141
00142 if(argc == 1)
00143 {
00144 help();
00145 return 0;
00146 }
00147
00148 optind = 0;
00149 while (1)
00150 {
00151 int c = getopt(argc, argv, const_cast<char *>("c:i:l:d:s:e:w:n:p:f:o:h:x:k:y:t:m:a:v"));
00152 if(c == -1)
00153 break;
00154
00155 switch(c)
00156 {
00157 case 'c':
00158 pconfig_file = optarg;
00159 break;
00160
00161 case 'i':
00162 pinput_file = optarg;
00163 break;
00164 case 'l':
00165 plist_file = optarg;
00166 break;
00167 case 'd':
00168 pinput_dir = optarg;
00169 break;
00170 case 's':
00171 pin_data_type = optarg;
00172 break;
00173 case 'w':
00174 pwave_fmt = optarg;
00175 break;
00176 case 'e':
00177 pwave_ext = optarg;
00178 break;
00179 case 'n':
00180 if(sscanf(optarg, "%d", &nchannels) != 1 || nchannels < 1)
00181 {
00182 fprintf(stderr, "ERROR: Invalid number of channels: %s.\n", optarg);
00183 return 1;
00184 }
00185 break;
00186 case 'p':
00187 if(sscanf(optarg, "%f,%f", &acwf_start, &acwf_len) != 2)
00188 {
00189 fprintf(stderr, "ERROR: Can not parse active waveform part '%s'\n", optarg);
00190 return 1;
00191 }
00192 break;
00193 case 'f':
00194 pfea_fmt = optarg;
00195 break;
00196 case 'o':
00197 poutput_file = optarg;
00198 break;
00199 case 'h':
00200 poutput_dir = optarg;
00201 break;
00202 case 'v':
00203 gErrorHandler.SetVerbose(true);
00204 break;
00205 case 'x':
00206 pvprint_ext = optarg;
00207 break;
00208 case 'k':
00209 pin_calib_file = optarg;
00210 break;
00211 case 'y':
00212 pout_calib_file = optarg;
00213 break;
00214 case 't':
00215 if(sscanf(optarg, "%d", &total_speakers) != 1 || total_speakers < 0)
00216 {
00217 fprintf(stderr, "ERROR: Invalid total number of speakers '%s'. "
00218 "Must be positive integer or 0.\n", optarg);
00219 return 1;
00220 }
00221 break;
00222 case 'm':
00223 if(sscanf(optarg, "%d", &max_speakers) != 1 || max_speakers < 0)
00224 {
00225 fprintf(stderr, "ERROR: Invalid maximal number of speakers '%s'. "
00226 "Must be positive integer or 0.\n", optarg);
00227 return 1;
00228 }
00229 break;
00230 case 'a':
00231 if(sscanf(optarg, "%f", &max_avg_distance) != 1 || max_avg_distance < 0)
00232 {
00233 fprintf(stderr, "ERROR: Invalid constant sets the number of speakers '%s'. "
00234 "Must be positive floating point number or 0.\n", optarg);
00235 return 1;
00236 }
00237 break;
00238 case '?':
00239 fprintf(stderr, "ERROR: Command line parsing error.\n");
00240 return 1;
00241 default :
00242 fprintf(stderr, "ERROR: Command line parsing error. Unexpected argument '%s'.\n", optarg);
00243 return 1;
00244 }
00245 }
00246
00247
00248 SLicenseManagerI *plicman = BSAPIGetLicenseManager();
00249 if (plicman)
00250 {
00251 plicman->SetErrorHandler(&gErrorHandler);
00252 plicman->RegisterLicenseFile("license.dat");
00253 }
00254
00255
00256 SVoicePrintExtractorI *pvp_extract = static_cast<SVoicePrintExtractorI *>(BSAPICreateInstance(SIID_VPRINTEXTRACTOR));
00257 if(!pvp_extract)
00258 {
00259 return 1;
00260 }
00261
00262
00263 pvp_extract->SetErrorHandler(&gErrorHandler);
00264
00265
00266 char pdefault_cfg[1024];
00267 sprintf(pdefault_cfg, "settings%sconfig", DIRSEP);
00268 if(!pvp_extract->Init((pconfig_file ? pconfig_file : pdefault_cfg)))
00269 {
00270 pvp_extract->Release();
00271 return 1;
00272 }
00273
00274 SBlockSetI *pbset = pvp_extract->GetBlockSet();
00275 if(!pbset)
00276 {
00277 pvp_extract->Release();
00278 return 1;
00279 }
00280
00281 SWaveformFormatConvertorI *pwc = static_cast<SWaveformFormatConvertorI *>(pbset->GetBlock("waveform_convertor"));
00282 if(!pwc)
00283 {
00284 pvp_extract->Release();
00285 return 1;
00286 }
00287
00288 pwc->SetNChannels(nchannels);
00289 if (!pwc->SetInputFormatStr(pwave_fmt))
00290 {
00291 pvp_extract->Release();
00292 return 1;
00293 }
00294
00295 if(pbset->Exists("feature_source"))
00296 {
00297 SFeatureSourceI *pfeature_source = static_cast<SFeatureSourceI *>(pbset->GetBlock("feature_source"));
00298 if(!pfeature_source)
00299 {
00300 pvp_extract->Release();
00301 return 1;
00302 }
00303
00304 if (!pfeature_source->SetFileFormatStr(pfea_fmt))
00305 {
00306 pvp_extract->Release();
00307 return 1;
00308 }
00309 }
00310
00311
00312 if(acwf_len > 0.0f)
00313 {
00314 SActiveWaveformPartSelectorI *pacwf_sel = static_cast<SActiveWaveformPartSelectorI *>(pbset->GetBlock("param.waveform_part_selector"));
00315 if(!pacwf_sel)
00316 {
00317 pvp_extract->Release();
00318 return 1;
00319 }
00320
00321 pacwf_sel->SetSelectionStartSec(acwf_start);
00322 pacwf_sel->SetSelectionLengthSec(acwf_len);
00323 }
00324
00325 if(pbset->Exists("diar.quantizer"))
00326 {
00327 SVectorQuantizerI *pquantizer = static_cast<SVectorQuantizerI *>(pbset->GetBlock("diar.quantizer"));
00328 if(!pquantizer)
00329 {
00330 pvp_extract->Release();
00331 return 1;
00332 }
00333
00334 if(total_speakers) pquantizer->SetTotalClusters(total_speakers);
00335 if(max_avg_distance) pquantizer->SetMaxAvgDistance(max_avg_distance);
00336 if(max_speakers) pquantizer->SetMaxClusters(max_speakers);
00337 }
00338 else
00339 {
00340 if(total_speakers != 0 || max_speakers != 0 || max_avg_distance != 0.0f)
00341 fprintf(stderr, "WARNING: Diarization options were not applied. Config file '%s' does not support diarization. \n", (pconfig_file ? pconfig_file : pdefault_cfg));
00342 }
00343
00344
00345 SUserCalibrationI *pcalib = 0;
00346 if(pin_calib_file || pout_calib_file)
00347 {
00348 pcalib = static_cast<SUserCalibrationI *>(pbset->GetBlock("user_calibration"));
00349 if(!pcalib)
00350 {
00351 pvp_extract->Release();
00352 return 1;
00353 }
00354
00355 if(pout_calib_file)
00356 {
00357 pcalib->StartEstimation();
00358 }
00359 else
00360 {
00361 pcalib->SetEnabled(true);
00362 if(!pcalib->Load(pin_calib_file))
00363 {
00364 pvp_extract->Release();
00365 return 1;
00366 }
00367 }
00368 }
00369
00370 if(!pvp_extract->SetInputDataTypeStr(pin_data_type))
00371 {
00372 pvp_extract->Release();
00373 return 1;
00374 }
00375
00376
00377 if(pinput_file && !pvp_extract->ProcessFile(pinput_file, poutput_file))
00378 {
00379 pvp_extract->Release();
00380 return 1;
00381 }
00382
00383
00384 pvp_extract->SetOutputDirectory(poutput_dir);
00385 pvp_extract->SetWaveformFileSuffix(pwave_ext);
00386 pvp_extract->SetVoicePrintFileSuffix(pvprint_ext);
00387
00388
00389
00390 if(plist_file && !pvp_extract->ProcessList(plist_file))
00391 {
00392 pvp_extract->Release();
00393 return 1;
00394 }
00395
00396 if(pinput_dir && !pvp_extract->ProcessDirectory(pinput_dir))
00397 {
00398 pvp_extract->Release();
00399 return 1;
00400 }
00401
00402
00403 if(pout_calib_file)
00404 {
00405 assert(pcalib);
00406 pcalib->EndEstimation();
00407 if (!pcalib->Save(pout_calib_file))
00408 {
00409 pvp_extract->Release();
00410 return 1;
00411 }
00412 }
00413
00414 pvp_extract->Release();
00415 return 0;
00416 }