00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #include <stdio.h>
00011 #include <stdlib.h>
00012 #include <stdarg.h>
00013 #include <string.h>
00014 #include <errno.h>
00015 #include <math.h>
00016
00017 #include "bsapi.h"
00018 #include "getopt.h"
00019
// Path separator string; used when the default configuration file path is
// built from its directory and file name.
00020 #ifdef WIN32
00021 #define DIRSEP "\\"
00022 #else
00023 #define DIRSEP "/"
00024 #endif
00025
00026
// Speech/training lengths below this threshold are reported as "too short":
// the dump routines print placeholder scores plus SID_BELOW_MIN_LEN_TEXT, and
// training prints a warning.
00027 #define SID_MIN_LEN_TO_PROCESS 10.0f // 10 seconds
// Marker appended to an output line whose speech length is below the threshold.
00028 #define SID_BELOW_MIN_LEN_TEXT "(too short)"
// Set of characters accepted in the -f column format string
// (s = raw score, n = normalized score, l = speech length, r = record length).
00029 #define SID_OUT_COLUMN_CHARS "snlr"
00030
00031
// Defaults used when the corresponding command-line option is not given:
// -w (waveform format), -e (audio file extension), -n (channel count) and
// -z (score sharpness).
00032 #define SID_DEF_WAVE_FMT "alaw"
00033 #define SID_DEF_WAVE_EXT "alw"
00034 #define SID_DEF_NCHANNELS 1
00035 #define SID_DEF_SCORE_SHARPNESS 0.3f
00036
00037
00038
00039
00040 class ErrorHandler : public SErrorCallbackI
00041 {
00042 public:
00043 ErrorHandler() : mVerbose(false) {;}
00044 virtual void BSAPI_METHOD OnTextMessage(unsigned int iId, message_type type, unsigned int messageId, char *pMessage)
00045 {
00046 switch(type)
00047 {
00048 case mtError:
00049 fprintf(stderr, "ERROR: %s - %s\n", BSAPIInterfaceId2Text(iId), pMessage);
00050 break;
00051 case mtWarning:
00052 fprintf(stderr, "WARNING: %s - %s\n", BSAPIInterfaceId2Text(iId), pMessage);
00053 break;
00054 case mtLog:
00055 LogMessage(pMessage);
00056 break;
00057 }
00058 }
00059
00060 void LogMessage(const char *pMessage, ...)
00061 {
00062 if (mVerbose)
00063 {
00064 va_list ap;
00065 va_start(ap, pMessage);
00066 vfprintf(stderr, pMessage, ap);
00067 fprintf(stderr, "\n");
00068 va_end(ap);
00069 }
00070 }
00071
00072 void SetVerbose(bool verbose) {mVerbose = verbose;}
00073
00074 protected:
00075 bool mVerbose;
00076 } gErrorHandler;
00077
00078 void help()
00079 {
00080 puts("\n Speaker identification ");
00081 printf(" %s\n", BSAPIVersion());
00082 puts(" ================================================================ ");
00083 puts(" ");
00084 puts(" USAGE: sid [options] ");
00085 puts(" ");
00086 puts(" system configuration: ");
00087 puts(" -c file configuration file ");
00088 puts(" -m dir model directory ");
00089 puts(" -a str1,str2... active speaker models ");
00090 puts(" -v verbose mode ");
00091 puts(" ");
00092 puts(" input: ");
00093 puts(" -i file input file ");
00094 puts(" -l file list of input files ");
00095 puts(" -d dir input directory ");
00096 puts(" -e str [alw] extension of audio files ");
00097 puts(" -w fmt [alaw] waveform format (lin16, lin8, alaw, mulaw) ");
00098 puts(" -n num [1] number of channels in audio files ");
00099 puts(" -p start,len active waveform part (in seconds) ");
00100 puts(" ");
00101 puts(" training: ");
00102 puts(" -t enable training ");
00103 puts(" -g str name of the speaker to be trained ");
00104 puts(" ");
00105 puts(" output: ");
00106 puts(" -s file output score file ");
00107 puts(" -r produce scores for all speakers ");
00108 puts(" -f columns enable column output format (see columns) ");
00109 puts(" -z num [0.3] score sharpness (positive number) ");
00110 puts(" ");
00111 puts(" columns: (columns to print are specified by string of ");
00112 puts(" the characters below, e.g. lsn) ");
00113 puts(" s raw score ");
00114 puts(" n score normalized to <0, 100> ");
00115 puts(" l speech length ");
00116 puts(" r record length ");
00117 puts(" ");
00118 }
00119
// Maps a raw score onto the range <0, 100> with a logistic curve:
//   100 / (1 + exp(-scale * score))
// A score of 0 maps to 50; `scale` controls how steep the curve is.
inline float RescaleScore(float score, float scale = 1.0f)
{
    const float exponent = -(scale * score);
    const float denominator = 1.0f + expf(exponent);
    return 100.0f / denominator;
}
00124
00125
00126 void DumpColWise(const char *pInputName, char **ppNames, float *pScores, int nSpeakers,
00127 float recordLength, float speechLength, float sharpness, const char *pColumnFmt, FILE *pFileHandle)
00128 {
00129 for (int i = 0; i < nSpeakers; i++)
00130 {
00131 fprintf(pFileHandle, "%s %s", pInputName, ppNames[i]);
00132 for (const char *pc = pColumnFmt; *pc != '\0'; pc++)
00133 {
00134 switch (*pc)
00135 {
00136 case 's':
00137 if (speechLength < SID_MIN_LEN_TO_PROCESS)
00138 fprintf(pFileHandle, " -inf");
00139 else
00140 fprintf(pFileHandle, " %.3f", pScores[i]);
00141 break;
00142
00143 case 'n':
00144 if (speechLength < SID_MIN_LEN_TO_PROCESS)
00145 fprintf(pFileHandle, " 0.000");
00146 else
00147 fprintf(pFileHandle, " %.3f", RescaleScore(pScores[i], sharpness));
00148 break;
00149
00150 case 'l':
00151 fprintf(pFileHandle, " %.3f", speechLength);
00152 break;
00153
00154 case 'r':
00155 fprintf(pFileHandle, " %.3f", recordLength);
00156 break;
00157 }
00158 }
00159
00160 if (speechLength < SID_MIN_LEN_TO_PROCESS)
00161 fprintf(pFileHandle, " %s", SID_BELOW_MIN_LEN_TEXT);
00162
00163 fprintf(pFileHandle, "\n");
00164 }
00165 }
00166
00167
00168
00169 void DumpRowWise(const char *pInputName, char **ppNames, float *pScores, int nSpeakers,
00170 float speechLength, float sharpness, FILE *pFileHandle)
00171 {
00172 static bool first_time = true;
00173
00174 if (first_time)
00175 {
00176 if (nSpeakers > 0)
00177 fprintf(pFileHandle, "%s", ppNames[0]);
00178 for (int i = 1; i < nSpeakers; i++)
00179 fprintf(pFileHandle, " %s", ppNames[i]);
00180 fprintf(pFileHandle, "\n");
00181 first_time = false;
00182 }
00183
00184 fprintf(pFileHandle, "%s", pInputName);
00185 for (int i = 0; i < nSpeakers; i++)
00186 {
00187 if (speechLength < SID_MIN_LEN_TO_PROCESS)
00188 fprintf(pFileHandle, " 0.000");
00189 else
00190 fprintf(pFileHandle, " %.3f", RescaleScore(pScores[i], sharpness));
00191 }
00192
00193 if (speechLength < SID_MIN_LEN_TO_PROCESS)
00194 fprintf(pFileHandle, " %s", SID_BELOW_MIN_LEN_TEXT);
00195
00196 fprintf(pFileHandle, "\n");
00197 }
00198
00199
00200 bool DumpScore(const char *pInputName, SSpeakerIDI *psid, bool dumpAllScores,
00201 const char *pColumnFmt, float sharpness, FILE *pFileHandle)
00202 {
00203 SWaveformFormatConvertorI *pwfconv = psid->GetWaveformFormatConvertor();
00204 if (!pwfconv)
00205 return false;
00206
00207 float speech_length = psid->GetTestLength();
00208 float record_length = pwfconv->GetInputLength();
00209
00210 if (dumpAllScores)
00211 {
00212 int num = 0;
00213 char **ppnames = psid->GetModelNames(&num);
00214 float *pscores = psid->GetModelScores(&num);
00215
00216
00217 if (!ppnames || !pscores)
00218 return false;
00219
00220
00221 if (pColumnFmt)
00222 {
00223 DumpColWise(pInputName, ppnames + 1, pscores + 1, num - 1, record_length,
00224 speech_length, sharpness, pColumnFmt, pFileHandle);
00225 }
00226 else
00227 {
00228 DumpRowWise(pInputName, ppnames + 1, pscores + 1, num - 1, speech_length,
00229 sharpness, pFileHandle);
00230 }
00231 }
00232 else
00233 {
00234 float score = 0.0f;
00235 char *pname = psid->GetBestModel(&score);
00236
00237 if (!pname)
00238 return false;
00239
00240 DumpColWise(pInputName, &pname, &score, 1, record_length,
00241 speech_length, sharpness, (pColumnFmt ? pColumnFmt : "n"), pFileHandle);
00242 }
00243
00244 return true;
00245 }
00246
00247 int main(int argc, char *argv[])
00248 {
00249
00250 char *pconfig_file = 0;
00251 char *pmodel_dir = 0;
00252 char *pinput_file = 0;
00253 char *plist_file = 0;
00254 char *pinput_dir = 0;
00255 char *pwave_fmt = SID_DEF_WAVE_FMT;
00256 char *pwave_ext = SID_DEF_WAVE_EXT;
00257 char *poutput_file = 0;
00258 char *pcolumn_fmt = 0;
00259 char *pspeaker_name = 0;
00260 char *pactive_speakers = 0;
00261 int nchannels = SID_DEF_NCHANNELS;
00262 bool training_mode = false;
00263 bool dump_all_scores = false;
00264 float sharpness = SID_DEF_SCORE_SHARPNESS;
00265 float acwf_start = 0;
00266 float acwf_len = 0;
00267
00268
00269 if(argc == 1)
00270 {
00271 help();
00272 return 0;
00273 }
00274
00275 optind = 0;
00276 while (1)
00277 {
00278 int c = getopt(argc, argv, const_cast<char *>("c:m:i:l:d:e:n:w:s:a:o:trf:vg:p:z:"));
00279 if(c == -1)
00280 break;
00281
00282 switch(c)
00283 {
00284 case 'c':
00285 pconfig_file = optarg;
00286 break;
00287 case 'm':
00288 pmodel_dir = optarg;
00289 break;
00290 case 'i':
00291 pinput_file = optarg;
00292 break;
00293 case 'l':
00294 plist_file = optarg;
00295 break;
00296 case 'd':
00297 pinput_dir = optarg;
00298 break;
00299 case 'w':
00300 pwave_fmt = optarg;
00301 break;
00302 case 'e':
00303 pwave_ext = optarg;
00304 break;
00305 case 'n':
00306 if(sscanf(optarg, "%d", &nchannels) != 1 || nchannels < 1)
00307 {
00308 fprintf(stderr, "ERROR: Invalid number of channels: %s.\n", optarg);
00309 return 1;
00310 }
00311 break;
00312 case 's':
00313 poutput_file = optarg;
00314 break;
00315 case 'v':
00316 gErrorHandler.SetVerbose(true);
00317 break;
00318 case 't':
00319 training_mode = true;
00320 break;
00321 case 'r':
00322 dump_all_scores = true;
00323 break;
00324 case 'f':
00325 pcolumn_fmt = optarg;
00326 if (strspn(pcolumn_fmt, SID_OUT_COLUMN_CHARS) != strlen(pcolumn_fmt))
00327 {
00328 fprintf(stderr, "ERROR: Wrong format of output columns string. "
00329 "The set of allowed characters is '%s'.\n", SID_OUT_COLUMN_CHARS);
00330 return 1;
00331 }
00332 break;
00333 case 'z':
00334 if(sscanf(optarg, "%f", &sharpness) != 1)
00335 {
00336 fprintf(stderr, "ERROR: Wrong value of score sharpness '%s'. Must be positive number.\n", optarg);
00337 return 1;
00338 }
00339 break;
00340 case 'g':
00341 pspeaker_name = optarg;
00342 break;
00343 case 'a':
00344 pactive_speakers = optarg;
00345 break;
00346 case 'p':
00347 if(sscanf(optarg, "%f,%f", &acwf_start, &acwf_len) != 2)
00348 {
00349 fprintf(stderr, "ERROR: Can not parse active waveform part '%s'\n", optarg);
00350 return 1;
00351 }
00352 break;
00353 case '?':
00354 fprintf(stderr, "ERROR: Command line parsing error.\n");
00355 return 1;
00356 }
00357 }
00358
00359
00360 SLicenseManagerI *plicman = BSAPIGetLicenseManager();
00361 if (plicman)
00362 {
00363 plicman->SetErrorHandler(&gErrorHandler);
00364 plicman->RegisterLicenseFile("license.dat");
00365 }
00366
00367
00368 SSpeakerIDI *psid = static_cast<SSpeakerIDI *>(BSAPICreateInstance(SIID_SPKID));
00369 if(!psid)
00370 {
00371 return 1;
00372 }
00373
00374
00375 psid->SetErrorHandler(&gErrorHandler);
00376
00377
00378 char pdefault_cfg[1024];
00379 sprintf(pdefault_cfg, "settings%sconfig", DIRSEP);
00380 if(!psid->Init((pconfig_file ? pconfig_file : pdefault_cfg)))
00381 {
00382 psid->Release();
00383 return 1;
00384 }
00385
00386
00387
00388 if(pmodel_dir && !psid->SetModelDirectory(pmodel_dir))
00389 {
00390 psid->Release();
00391 return 1;
00392 }
00393
00394
00395
00396 SWaveformFormatConvertorI *pwc = psid->GetWaveformFormatConvertor();
00397 if (pwc)
00398 {
00399 pwc->SetNChannels(nchannels);
00400 pwc->SetInputFormatStr(pwave_fmt);
00401 }
00402
00403
00404
00405
00406
00407 if(training_mode)
00408 {
00409
00410 if((pinput_file || pinput_dir) && !pspeaker_name)
00411 {
00412 fprintf(stderr, "ERROR: Training using one file or directory without knowing speaker name. Please set -g speaker\n");
00413 psid->Release();
00414 return 1;
00415 }
00416
00417
00418
00419
00420 int nreq_iters = psid->GetNRequestedTrainingIters();
00421
00422 int i;
00423 for(i = 0; i < nreq_iters; i++)
00424 {
00425
00426
00427
00428 if(!psid->StartTrainingIteration())
00429 {
00430 psid->Release();
00431 return 1;
00432 }
00433
00434
00435 if(pinput_file && !psid->AddFile(pspeaker_name, pinput_file))
00436 {
00437 psid->Release();
00438 return 1;
00439 }
00440
00441
00442
00443 if(plist_file && !psid->AddFilesFromListFile(plist_file))
00444 {
00445 psid->Release();
00446 return 1;
00447 }
00448
00449
00450 if(pinput_dir && !psid->AddFilesFromDirectory(pspeaker_name, pinput_dir, pwave_ext))
00451 {
00452 psid->Release();
00453 return 1;
00454 }
00455 }
00456
00457
00458
00459
00460 if(!psid->FinishTraining())
00461 {
00462 psid->Release();
00463 return 1;
00464 }
00465
00466 if (pspeaker_name && (psid->GetTrainingLength(pspeaker_name) < SID_MIN_LEN_TO_PROCESS))
00467 {
00468 fprintf(stderr, "WARNING: Training record(s) contain only %.3f seconds of speech. "
00469 "At least %f seconds are needed to obtain significant results.\n",
00470 psid->GetTrainingLength(pspeaker_name), SID_MIN_LEN_TO_PROCESS);
00471 }
00472 }
00473 else
00474 {
00475
00476
00477
00478 if(!psid->SetActiveWaveformPart(acwf_start, acwf_len))
00479 {
00480 psid->Release();
00481 return 1;
00482 }
00483
00484
00485 if(pactive_speakers)
00486 psid->ActivateModels(pactive_speakers);
00487 else
00488 psid->ActivateAllModels();
00489
00490
00491
00492
00493
00494 FILE *pf_out = stdout;
00495 if(poutput_file)
00496 {
00497 pf_out = fopen(poutput_file, "w");
00498 if(!pf_out)
00499 {
00500 fprintf(stderr, "ERROR: Can not open output score file '%s'.", poutput_file);
00501 psid->Release();
00502 return 1;
00503 }
00504 }
00505
00506
00507 if(pinput_file)
00508 {
00509 gErrorHandler.LogMessage("Processing file: %s", pinput_file);
00510 if(!psid->TestFile(pinput_file))
00511 {
00512 psid->Release();
00513 return 1;
00514 }
00515 DumpScore(pinput_file, psid, dump_all_scores, pcolumn_fmt, sharpness, pf_out);
00516 }
00517
00518
00519 if(plist_file)
00520 {
00521 SFileListI *plist = static_cast<SFileListI *>(BSAPICreateInstance(SIID_FILELIST));
00522 if(!plist)
00523 {
00524 fprintf(stderr, "Memory allocation error.");
00525 psid->Release();
00526 return 1;
00527 }
00528 plist->SetErrorHandler(&gErrorHandler);
00529 if(!plist->AddList(plist_file))
00530 {
00531 plist->Release();
00532 psid->Release();
00533 return 1;
00534 }
00535
00536 plist->FirstLine();
00537 char ptarget[1024];
00538 char psource[1024];
00539 int start;
00540 int end;
00541 float prob;
00542 while(plist->GetLine(ptarget, psource, &start, &end, &prob))
00543 {
00544 gErrorHandler.LogMessage("Processing file: %s", psource);
00545 if(!psid->TestFile(psource))
00546 {
00547 plist->Release();
00548 psid->Release();
00549 return 1;
00550 }
00551 DumpScore(psource, psid, dump_all_scores, pcolumn_fmt, sharpness, pf_out);
00552 }
00553 plist->Release();
00554 }
00555
00556
00557
00558 if(pinput_dir)
00559 {
00560 SFileSnifferI *psniffer = static_cast<SFileSnifferI *>(BSAPICreateInstance(SIID_FILESNIFFER));
00561 if(!psniffer)
00562 {
00563 fprintf(stderr, "Memory allocation error.");
00564 psid->Release();
00565 return 1;
00566 }
00567 psniffer->SetErrorHandler(&gErrorHandler);
00568 psniffer->AddDirectory(pinput_dir);
00569 psniffer->AddWantedSuffix(pwave_ext);
00570
00571 if(!psniffer->FirstFile())
00572 {
00573 psniffer->Release();
00574 psid->Release();
00575 return 1;
00576 }
00577
00578 char psource[1024];
00579 while(psniffer->GetFile(psource, sizeof(psource) - 1))
00580 {
00581 gErrorHandler.LogMessage("Processing file: %s", psource);
00582 if(!psid->TestFile(psource))
00583 {
00584 psniffer->Release();
00585 psid->Release();
00586 return 1;
00587 }
00588 DumpScore(psource, psid, dump_all_scores, pcolumn_fmt, sharpness, pf_out);
00589 }
00590 psniffer->Release();
00591 }
00592
00593
00594 if(pf_out != stdout)
00595 fclose(pf_out);
00596 }
00597
00598
00599 psid->Release();
00600
00601 return 0;
00602 }