From 6f1e4cb34808fece84e12d099be75dc13aad835b Mon Sep 17 00:00:00 2001 From: Schmidtke Date: Sun, 12 Jan 2025 13:11:28 +0100 Subject: [PATCH 1/2] corrects mistake in readme, makes the index nicer, adds a training page that is still work in progress --- README.rst | 5 ++++- docs/source/index.rst | 4 ++++ docs/source/training.rst | 28 ++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 docs/source/training.rst diff --git a/README.rst b/README.rst index 46ccbc2..dc9b7da 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ PAULE .. image:: https://zenodo.org/badge/355606517.svg :target: https://zenodo.org/badge/latestdoi/355606517 -Predictive Articulatory speech synthesis Utilizing Lexical Embeddings (PAULE) a +Predictive Articulatory speech synthesis Utilizing Lexical Embeddings (PAULE) is a python framework to plan control parameter trajectories for the VocalTractLab simulator for a target acoustics or semantic embedding. @@ -54,6 +54,9 @@ To cite the PAULE source code use the DOI 10.5281/zenodo.7252431 (https://zenodo.org/doi/10.5281/zenodo.7252431), if you want to cite the software in general or the specific DOI on Zenodo. +Linked Projects +--------------- +To generate suitable corpus data for the PAULE model, we have also implemented `create_vtl_corpus `_ Acknowledgements ---------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 9186f23..a278186 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,6 +3,10 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. + +PAULE +====== +Predictive Articulatory speech synthesis Utilizing Lexical Embeddings (PAULE) is a python framework to plan control parameter trajectories for the VocalTractLab simulator for a target acoustics or semantic embedding. .. 
toctree:: :maxdepth: 2 :caption: Contents: diff --git a/docs/source/training.rst b/docs/source/training.rst new file mode 100644 index 0000000..558c66b --- /dev/null +++ b/docs/source/training.rst @@ -0,0 +1,28 @@ +Training +========== + + + +Data +===== + +Paule requires data to be in the following format: + + + + +Training +======== +For effective training you probably want to use a GPU. + + +Paule consists of a number of models that are trained separately. The models are: + +- `Embedder` Input: Log mel spectrogram, Output: Semantic embedding, is added to the target embedding + +- `ForwardModel` Input: Normalized control parameters, Output: Log mel spectrogram + +- `InverseModel` Input: Log mel spectrogram, Output: Normalized control parameters + + + From 5b797956113427c35950ba8628dbe30917f8b26d Mon Sep 17 00:00:00 2001 From: Schmidtke Date: Thu, 27 Mar 2025 16:29:39 +0100 Subject: [PATCH 2/2] changes dropout to 0.7 --- docs/source/training.rst | 2 ++ paule/paule.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/training.rst b/docs/source/training.rst index 558c66b..dbb7f35 100644 --- a/docs/source/training.rst +++ b/docs/source/training.rst @@ -24,5 +24,7 @@ Paule consists of a number of models that are trained separately. 
The models are: - `InverseModel` Input: Log mel spectrogram, Output: Normalized control parameters +- `MelGANs` + diff --git a/paule/paule.py b/paule/paule.py index f6bcbc4..63c27d8 100644 --- a/paule/paule.py +++ b/paule/paule.py @@ -162,7 +162,7 @@ def __init__(self, *, pred_model=None, pred_optimizer=None, inv_model=None, inv_ if embedder: self.embedder = embedder else: - self.embedder = EmbeddingModel(num_lstm_layers=2, hidden_size=720).double() + self.embedder = EmbeddingModel(num_lstm_layers=2, hidden_size=720, dropout=0.7).double() self.embedder.load_state_dict(torch.load( os.path.join(DIR, "pretrained_models/embedder/embed_model_common_voice_syn_rec_2_720_0_dropout_07_noise_6e05_rmse_lr_00001_200.pt"), map_location=self.device))