import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
def process_child_speech(file_path):
    """Load an audio clip and extract MFCC speech features.

    Parameters
    ----------
    file_path : str or path-like
        Path to the audio file to analyze.

    Returns
    -------
    tuple[numpy.ndarray, numpy.ndarray]
        ``(mfccs, mfccs_processed)`` where ``mfccs`` is the full
        ``(13, n_frames)`` MFCC matrix and ``mfccs_processed`` is the
        per-coefficient mean across frames — a single 13-element
        'summary' vector for the whole clip.
    """
    # 1. Load the audio file.
    # Resample to 16 kHz because most speech models expect 16 kHz input.
    audio, sample_rate = librosa.load(file_path, sr=16000)

    # 2. Extract MFCCs.
    # These represent the 'phones' (sounds) the child is making.
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13)

    # 3. Average across time frames (axis 0 of the transposed matrix)
    # to collapse the clip into one summary vector.
    mfccs_processed = np.mean(mfccs.T, axis=0)

    return mfccs, mfccs_processed
# --- Visualization (to see what the AI "sees") ---
def plot_speech_features(mfccs):
    """Display an MFCC matrix as a time-aligned heatmap.

    Parameters
    ----------
    mfccs : numpy.ndarray
        MFCC matrix of shape ``(n_mfcc, n_frames)``, e.g. the first
        return value of ``process_child_speech``.

    Notes
    -----
    Opens a matplotlib window via ``plt.show()``; returns ``None``.
    """
    plt.figure(figsize=(10, 4))
    # specshow renders the coefficient matrix with a time axis,
    # so each column lines up with a moment in the recording.
    librosa.display.specshow(mfccs, x_axis='time')
    plt.colorbar()
    plt.title('MFCC: The Sound Fingerprint')
    plt.tight_layout()
    plt.show()
# Example usage:
# Replace 'child_voice.wav' with a file from your training data.
# mfccs, processed_data = process_child_speech('child_voice.wav')
# plot_speech_features(mfccs)