Vocal tract [WIP]

Dittytoy implementation of a Kelly–Lochbaum vocal tract, à la dood.al/pinktrombone/

todo:
- use the same tongue positioning as pink trombone
- nasal tract
- phonemes
- (maybe) make a similar enough sounding formant synthesizer

Log in to post a comment.

// actual bad apple is a bit faster (138bpm), but this should help make it a bit easier to work with
ditty.bpm = 120;

// manual controls
// NOTE(review): the trailing min/max/step comments appear to be Dittytoy slider
// annotations parsed by the sandbox — keep them byte-identical.
input.forward = 0.5; // min=0, max=1, step=0.01
input.up = 0.5; // min=0, max=1, step=0.01
input.mouth = 0.5; // min=0, max=1, step=0.01
input.nose = 0.5; // min=0, max=1, step=0.01

// === vocal tract ===

// Softened sawtooth oscillator: a plain saw with its discontinuity smoothed
// by a polynomial step (polyblep-style), so larger `formant` values give a
// sharper edge and more high-frequency content.
// p: phase in cycles (any real — wraps to [0, 1)); formant: edge sharpness.
// Returns a value in [-1, 1].
function softsaw(p, formant) {
    // fractional part of the phase; Math.floor (rather than ~~) so negative
    // phases also wrap into [0, 1) instead of producing a negative x
    const x = p - Math.floor(p);
    // polyblep: nonzero only near the phase wrap (x close to 0 or 1)
    const f = Math.abs(formant);
    const s = Math.max(0, Math.abs((f + 1) * (2 * x - 1)) - f);
    // saw minus the smoothing step
    return 2 * x - 1 - s * s * Math.sign(x - 0.5);
}

// air
const SPEED_OF_SOUND = 343; // roughly 343 m/s at 20 deg C

// vocal tract (lengths/positions in metres, measured from the glottis)
const TRACT_LENGTH = 0.17; // 17 cm (original comment said 22 cm; the value is 0.17 m)
const NOSE_LENGTH = 0.14; // 14 cm
const NOSE_START = 0.17; // 0.17 m = 17 cm — original comment said 3 cm; TODO confirm intended value

const LIP_START = 0.15; // lips begin 15 cm from the glottis
const TONGUE_START = 0.02; // tongue begins 2 cm from the glottis

// reflections
const GLOT_REFLECT = 0.75; // reflection coefficient at the (mostly closed) glottal end
const LIP_REFLECT = -0.85; // reflection coefficient at the open lip end (sign flip)

// single tract
// Kelly-Lochbaum waveguide model: the tract is a chain of segments with
// varying cross-sectional area; at each junction between segments, part of
// the travelling wave reflects in proportion to the area mismatch.
class Tract {
    /**
     * All lengths/positions are in samples (segment counts); non-integer
     * values are truncated.
     * @param len      total tract length
     * @param nose     nasal tract length
     * @param junction segment index where the nasal tract branches off (stored, not used yet)
     * @param lip      segment index where the lip section begins
     * @param tongue   segment index where the tongue section begins
     */
    constructor(len, nose, junction, lip, tongue) {
        // lengths, truncated
        len = len|0;
        nose = nose|0;
        junction = junction|0;
        lip = lip|0;
        tongue = tongue|0;
        
        // wave state: left/right travelling waves plus "working" buffers for the next step
        this.tl = new Float32Array(len).fill(0);
        this.tr = new Float32Array(len).fill(0);
        this.twl = new Float32Array(len).fill(0);
        this.twr = new Float32Array(len).fill(0);
        
        // cross-sectional areas
        this.tarea = new Float32Array(len).fill(1);
        
        // junction reflection coefficients
        this.trefl = new Float32Array(len).fill(0);
        
        // junction state
        this.junction = junction;
        
        // nose state
        // (fixed: these buffers were allocated with `len` instead of `nose`)
        this.nl = new Float32Array(nose).fill(0);
        this.nr = new Float32Array(nose).fill(0);
        this.nwl = new Float32Array(nose).fill(0);
        this.nwr = new Float32Array(nose).fill(0);
        
        // nose areas
        this.narea = new Float32Array(nose).fill(1);
        
        // nose reflection coefficients
        this.nrefl = new Float32Array(nose).fill(0);
        
        // articulation
        this.lip = lip;
        this.tongue = tongue;
    }
    
    // calculate reflection coefficients from the current areas:
    // refl[i] = (area[i-1] - area[i]) / (area[i-1] + area[i]),
    // with total reflection (1) as a fallback when both areas are near zero
    updateReflection() {
        // tract
        for (let i = 1; i < this.trefl.length; i++) {
            const sum = this.tarea[i - 1] + this.tarea[i];
            this.trefl[i] = Math.abs(sum) > 1e-6 ? (this.tarea[i - 1] - this.tarea[i]) / sum : 1;
        }
        
        // nose
        for (let i = 1; i < this.nrefl.length; i++) {
            const sum = this.narea[i - 1] + this.narea[i];
            this.nrefl[i] = Math.abs(sum) > 1e-6 ? (this.narea[i - 1] - this.narea[i]) / sum : 1;
        }
        
        // nose junction
        // TODO: compute the three-way coupling coefficients at this.junction
    }
    
    // propagate the wave by one sample; v is the glottal excitation injected at segment 0
    propagate(v) {
        // tract boundary conditions
        this.twr[0] = this.tl[0] * GLOT_REFLECT + v; // glottal reflection
        this.twl[this.twl.length - 1] = this.tr[this.tr.length - 1] * LIP_REFLECT; // lip reflection
        
        // scattering at each interior junction
        for (let i = 1; i < this.tl.length; i++) {
            const w = this.trefl[i] * (this.tr[i - 1] + this.tl[i]);
            this.twr[i] = this.tr[i - 1] - w;
            this.twl[i - 1] = this.tl[i] + w;
        }
        
        // commit the working buffers, with slight damping so energy decays
        for (let i = 0; i < this.tl.length; i++) {
            this.tl[i] = this.twl[i] * 0.999;
            this.tr[i] = this.twr[i] * 0.999;
        }
        
        // nose
        // TODO: propagate the nasal tract and couple it in at this.junction
    }
    
    // set the shape of the tract
    // forward/up position the tongue hump; nose is reserved for the nasal
    // tract (TODO); mouth opens/closes the lips (0 = open, 1 = pinched)
    shape(forward, up, nose, mouth) {
        // nose, static shape
        // TODO: fill this.narea
        
        // mouth
        up += 2;
        for (let i = 0; i < this.tarea.length; i++) {
            if (i >= this.tongue && i < this.lip) {
                // where in the mouth we are
                const t = 1.1 * Math.PI * (forward * this.tarea.length - i) / (this.lip - this.tongue);
                const d = 2 + (up - 2) / 1.5;
                
                this.tarea[i] = Math.pow(1.5 - (1.5 - d + 1.7) * Math.cos(t), 2);
            } else if (i >= this.lip) {
                // where in the lip we are
                const p = (i - this.lip) / (this.tarea.length - this.lip);
                this.tarea[i] = Math.pow(1.5 - mouth * mouth * 4 * p * (1 - p), 2);
            } else this.tarea[i] = 0.6 * 0.6; // default area below the tongue
        }
    }
    
    // advance one sample: v is the glottal input
    // returns [mouth output, nose output] (nose output not implemented yet, always 0)
    tick(v) {
        this.updateReflection();
        this.propagate(v);
        
        // return the out waves
        return [this.tr[this.tr.length - 1], 0];
    }
}

const voc = synth.def(class {
    constructor(options) {
        // keep options so selectShape() can read the voice table
        // (it previously referenced a bare `options` that was not in scope)
        this.options = options;
        
        // formant phase: fractional index into options.voice
        this.select = 0;
        
        // exciter (glottal) phase, kept in [0, 1)
        this.phase = 0;
        
        // vocal tract, sized in samples from the physical lengths
        this.tract = new Tract(
            ditty.sampleRate * TRACT_LENGTH / SPEED_OF_SOUND,
            ditty.sampleRate * NOSE_LENGTH / SPEED_OF_SOUND,
            ditty.sampleRate * NOSE_START / SPEED_OF_SOUND,
            ditty.sampleRate * LIP_START / SPEED_OF_SOUND,
            ditty.sampleRate * TONGUE_START / SPEED_OF_SOUND
        );
        //this.tract.shape(Math.random(), Math.random(), Math.random(), Math.random());
    }
    
    // select a mouth shape (WIP: blend values are computed but the returned
    // shape is still empty)
    selectShape() {
        // integer part = current voice index
        const idx = this.select|0;
        
        // how much to blend between the two neighbouring shapes
        // (was `this.select - this.select|0`, which parses as
        // `(this.select - this.select)|0` and is always 0)
        const fblend = this.select - idx;
        
        // first shape
        const fa = this.options.voice[idx];
        
        // second shape, falling back to the first past the end of the table
        const fb = this.options.voice[idx + 1] || fa;
        
        return {
            
        };
    }
    
    process(note, env, tick, options) {
        // phase bookkeeping
        const dt = midi_to_hz(note) * ditty.dt;
        this.phase += dt;
        this.phase -= this.phase|0;
        
        // formant bookkeeping: sweep through the voice shapes over the note duration
        const fdt = (options.voice.length - 1) * ditty.dt / tick_to_second(options.duration);
        this.select += fdt;
        
        
        // update diameters
        // TODO
        
        // tick tracts
        // see https://dood.al/pinktrombone/
        // TODO
        
        // turbulence?
        // TODO
        
        // glottal air wave
        // technically not entirely correct like this, but is a lot easier to do
        const glottal = env.value * softsaw(this.phase, 4);
        this.tract.shape(input.forward, input.up, input.nose, input.mouth);
        const [mouth, nose] = this.tract.tick(glottal);
        
        // logging
        debug.log("Frequency", midi_to_hz(note));
        debug.probe("Glottal flow", glottal, 1, 4 / midi_to_hz(note));
        debug.probe("Lips out", mouth, 2, 6 / midi_to_hz(note));
        debug.probe("Nose out", nose, 2, 6 / midi_to_hz(note));
        
        // out!
        return mouth * 0.5;
    }
}, { attack: 0.05, release: 0.05 });

// === melody ===

// sing the vocals
// similar to patmerge, but also allows providing vocals:
// consecutive repeats of the same note are merged into one longer note, and
// each merged note is sung with the matching syllable from `vocals`.
function sing(instr, pat, time, vocals) {
    // merge runs of identical notes into { note, dur } entries
    const merged = [];
    for (const note of pat) {
        const last = merged[merged.length - 1];
        if (last && last.note == note) last.dur += time;
        else merged.push({ note, dur: time });
    }
    
    // play each merged note (0 and below act as a rest) and advance time
    merged.forEach(({ note, dur }, i) => {
        if (note > 0) instr.play(note, { duration: dur, voice: vocals[i] });
        sleep(dur);
    });
}

// vocals!
// first verse from bad apple

// vocals, verse
// 64 beats
// The romanization is slightly shuffled around to fit the syllables to the
// notes. Each row of the table is [notes, note length in beats, lyrics].
function vocalsVerse(vc) {
    const verse = [
        // nagareteku toki
        [[ds4, f4, fs4, gs4, as4, as4, ds5, cs5], 1/2, ["na", "ga", "re", "te", "ku", "to", "ki"]],
        // no na ka de demo
        [[as4, as4, ds4, ds4, as4, gs4, fs4, f4], 1/2, ["no", "na", "ka", "de", "de", "mo"]],
        // kedarusa ga hora
        [[ds4, f4, fs4, gs4, as4, as4, gs4, fs4], 1/2, ["ke", "da", "ru", "sa", "ga", "ho", "ra"]],
        // guruguru mawa ate
        [[f4, ds4, f4, fs4, f4, ds4, d4, f4], 1/2, ["gu", "ru", "gu", "ru", "ma", "wa", "a", "te"]],
        
        // watashi ka ra hana
        [[ds4, f4, fs4, gs4, as4, as4, ds5, cs5], 1/2, ["wa", "ta", "shi", "ka", "ra", "ha", "na"]],
        // re ru kokoro mo
        [[as4, as4, ds4, ds4, as4, gs4, fs4, f4], 1/2, ["re", "ru", "ko", "ko", "ro", "mo"]],
        // mi en ai wa so o
        [[ds4, f4, fs4, gs4, as4, as4, gs4, fs4], 1/2, ["mi", "en", "a", "i", "wa", "so", "o"]],
        
        // shi ra na i?
        [[f4, fs4, gs4, as4], 1/1, ["shi", "ra", "na", "i"]],
        
        // jibu un ka ra -- u go
        [[ds4, f4, fs4, gs4, as4, as4, ds5, cs5], 1/2, ["ji", "bu", "un", "ka", "ra", "u", "go"]],
        // ku ko to mo naku
        [[as4, as4, ds4, ds4, as4, gs4, fs4, f4], 1/2, ["ku", "ko", "to", "mo", "na", "ku"]],
        // toki no suki ma ni
        [[ds4, f4, fs4, gs4, as4, as4, gs4, fs4], 1/2, ["to", "ki", "no", "su", "ki", "ma", "ni"]],
        // nagasare tsuzukete
        [[f4, ds4, f4, fs4, f4, ds4, d4, f4], 1/2, ["na", "ga", "sa", "re", "tsu", "zu", "ke", "te"]],
        
        // Shirana i wa mawa
        [[ds4, f4, fs4, gs4, as4, as4, ds5, cs5], 1/2, ["shi", "ra", "na", "i", "wa", "ma", "wa"]],
        // ri no koto nado
        [[as4, as4, ds4, ds4, as4, gs4, fs4, f4], 1/2, ["ri", "no", "ko", "to", "na", "do"]],
        // watashi wa watashi
        [[ds4, f4, fs4, gs4, as4, as4, gs4, fs4], 1/2, ["wa", "ta", "shi", "wa", "wa", "ta", "shi"]],
        
        // sore dake
        [[f4, fs4, gs4, as4], 1/1, ["so", "re", "da", "ke"]],
    ];
    
    for (const [pat, time, lyrics] of verse) sing(vc, pat, time, lyrics);
}


// Main loop, connected to the filter we just created
loop(() => {
    // voice articulation demo: four long (6-beat) notes to hear the mouth shapes
    sing(voc, [ds4, f4, gs4, as4], 6, ["sus", "amo", "gus", "sus"]);
    
    // sing bad apple
    vocalsVerse(voc);
}, { name: 'Vocals' });