feat: upgrade pre-commit to use single stage

Roman Knyazhitskiy 2024-05-18 12:27:18 +00:00
parent f879765ed0
commit 594cdd86d1
23 changed files with 167 additions and 107 deletions

2 .gitignore vendored
View file

@@ -1 +1 @@
-.apikeys.env
+.apikeys.env

View file

@@ -12,7 +12,7 @@ stages:
- build
- lint
- test
build:app:
stage: build
image: 'node:22.1.0-slim'
@@ -30,7 +30,7 @@ lint:app:
stage: lint
image: 'node:22.1.0-slim'
script:
-- *pre_pnpm
+- *pre_pnpm
- pnpm install --prefer-offline
- pnpm lint
cache:
@@ -45,7 +45,7 @@ lint:app:
test:app:
stage: test
image: 'node:22.1.0-slim'
-script:
+script:
- *pre_pnpm
- pnpm install --prefer-offline
- pnpm test:unit --coverage
@@ -55,4 +55,3 @@ test:app:
- frontend/pnpm-lock.yaml
paths:
- frontend/.pnpm-store

View file

@@ -1,4 +1,10 @@
repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+rev: v2.3.0
+hooks:
+- id: check-yaml
+- id: end-of-file-fixer
+- id: trailing-whitespace
- repo: https://github.com/compilerla/conventional-pre-commit
rev: v3.2.0
hooks:
@@ -10,15 +16,27 @@ repos:
- id: ruff
types_or: [python, pyi, jupyter]
args: [--fix]
+stages: [pre-commit]
- id: ruff-format
types_or: [python, pyi, jupyter]
+stages: [pre-commit]
- repo: https://github.com/RobertCraigie/pyright-python
rev: v1.1.351
hooks:
- id: pyright
types_or: [python, pyi, jupyter]
additional_dependencies: [numpy, pytest, fastapi, praat-parselmouth, orjson, pydantic, scipy, psycopg, deepgram-sdk, pydub, ffmpeg-python]
+stages: [pre-commit]
- repo: https://github.com/crate-ci/typos
rev: v1.21.0
hooks:
- id: typos
+stages: [pre-commit]
- repo: local
hooks:
- id: pnpm-lint
name: pnpm lint
language: system
-entry: bash -c 'cd frontend; pnpm prettier --write . && pnpm eslint --fix .'
+stages: [pre-commit]
+types_or: [ts, javascript, svelte]
+entry: bash -c 'cd frontend; pnpm prettier --write . && pnpm eslint --fix .'

View file

@@ -2,4 +2,4 @@ POSTGRES_DB= spectral_db
POSTGRES_USER= user
POSTGRES_PASSWORD= password
POSTGRES_HOST= postgres
-POSTGRES_PORT= 5432
+POSTGRES_PORT= 5432

View file

@@ -31,7 +31,7 @@ To use the conventional-pre-commit, which enforces [Conventional Commits](https:
pre-commit install --hook-type commit-msg
```
-## Making code changes
+## Making code changes
Now make your changes. Make sure to include additional tests if necessary.
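For reference, messages accepted by the conventional-pre-commit hook above follow the Conventional Commits `type: description` pattern, as in the subject of this very commit; the second line is an illustrative, hypothetical example:

```
feat: upgrade pre-commit to use single stage
fix: handle a missing Deepgram API key
```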

1 backend/.gitignore vendored
View file

@@ -1,2 +1 @@
**/__pycache__/**

View file

@@ -16,4 +16,4 @@ To be able to use the deepgram model for the endpoint api/transcription/{model}/
```
DG_KEY = <your_api_key>
-```
+```

View file

@ -3,7 +3,7 @@ name = "spectral"
version = "0.0.1"
description = "Backend for spectral: atypical speech analysis and recognition platform"
authors = [
"Roman Knyazhitskiy <mail@knyaz.tech>",
"Roman Knyazhitskiy <mail@knyaz.tech>",
"Ody Machairas",
"Quinton Den Haan",
"Thijs Houben",

View file

@@ -1,21 +1,24 @@
from pydantic import BaseModel
from typing import Optional
class Frame(BaseModel):
"""
Frame model representing a frame of data with its sampling frequency.
Attributes:
data (list): The data contained in the frame.
fs (float): The sampling frequency of the data.
"""
data: list
fs: float
class Signal(BaseModel):
"""
-Signal model representing a signal which contains both various attributes related to its
-sampling frequency and values, and paramaters for calculating the pitches, spectogram and formants
+Signal model representing a signal which contains both various attributes related to its
+sampling frequency and values, and parameters for calculating the pitches, spectogram and formants
Attributes:
data (list): The data contained in the signal.
@@ -27,6 +30,7 @@ class Signal(BaseModel):
formants_time_step (Optional[float]): The time step for formants analysis. Defaults to None.
formants_window_length (float): The window length for formants analysis. Defaults to 0.025 seconds.
"""
data: list
fs: float
pitch_time_step: Optional[float] = None
@@ -34,4 +38,4 @@ class Signal(BaseModel):
spectogram_window_length: float = 0.005
spectogram_frequency_step: float = 20.0
formants_time_step: Optional[float] = None
-formants_window_length: float = 0.025
+formants_window_length: float = 0.025
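As a brief aside, a minimal sketch of how these two pydantic models might be constructed, assuming the package is importable as `spectral.data_objects` (the sample values are illustrative only):

```
# Sketch only: `spectral.data_objects` is an assumed import path.
from spectral.data_objects import Frame, Signal

# A Frame is raw samples plus their sampling frequency.
frame = Frame(data=[0.0, 0.1, -0.2], fs=44100.0)

# A Signal additionally carries analysis parameters; unset Optional fields stay None.
signal = Signal(data=[0.0, 0.1, -0.2], fs=44100.0, pitch_time_step=0.01)
print(signal.formants_window_length)  # 0.025, the documented default
```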

View file

@@ -1,6 +1,7 @@
-import psycopg
+import psycopg
import uuid
class Database:
"""
Database class for interacting with a PostgreSQL database.
@@ -10,13 +11,14 @@ class Database:
Attributes:
conn (psycopg.Connection): The connection object to the database.
cursor (psycopg.Cursor): The cursor object to execute database queries.
Methods:
fetch_file(id: int) -> dict:
Fetches a file record from the database by its ID.
close():
Closes the database connection and cursor.
"""
def __init__(self, user, password, host, port, dbname):
"""
Initializes the Database object and opens a connection to the specified PostgreSQL database.
@@ -29,16 +31,12 @@ class Database:
dbname (str): The name of the database.
"""
self.conn = psycopg.connect(
-dbname=dbname,
-user=user,
-password=password,
-host=host,
-port=port
+dbname=dbname, user=user, password=password, host=host, port=port
)
print("database connection opened")
self.cursor = self.conn.cursor()
-def fetch_file(self,id):
+def fetch_file(self, id):
"""
Fetches a file record from the database by its ID.
@@ -48,10 +46,19 @@ class Database:
Returns:
dict: A dictionary containing the file record's details.
"""
self.cursor.execute("SELECT * FROM files WHERE id = %s",[id])
res = self.cursor.fetchone()
return {"id":res[0],"name":res[1],"data":res[2],"creationTime":res[3],"modifiedTime":res[4],"uploader":res[5],"session":res[6],"emphemeral":res[7]}
self.cursor.execute("SELECT * FROM files WHERE id = %s", [id])
res = self.cursor.fetchone() # type: ignore
return {
"id": res[0], # type: ignore
"name": res[1], # type: ignore
"data": res[2], # type: ignore
"creationTime": res[3], # type: ignore
"modifiedTime": res[4], # type: ignore
"uploader": res[5], # type: ignore
"session": res[6], # type: ignore
"emphemeral": res[7], # type: ignore
}
def store_transcription(self, file_id, file_transcription):
"""
Stores a transcription record in the database.
@@ -61,16 +68,28 @@
file_transcription (list): A list of transcription entries to store, each containing "start", "end", and "value" keys.
"""
file_transcription_id = str(uuid.uuid4())
self.cursor.execute("""
self.cursor.execute(
"""
INSERT INTO file_transcription (id, file)
VALUES (%s, %s);
""",[file_transcription_id,file_id])
""",
[file_transcription_id, file_id],
)
for transcription in file_transcription:
self.cursor.execute("""
self.cursor.execute(
"""
INSERT INTO transcription (id, file_transcription, start, "end", value)
VALUES (%s, %s, %s, %s, %s);
""",[str(uuid.uuid4()),file_transcription_id,transcription["start"],transcription["end"],transcription["value"]])
""",
[
str(uuid.uuid4()),
file_transcription_id,
transcription["start"],
transcription["end"],
transcription["value"],
],
)
def get_transcriptions(self, file_id):
"""
Fetches transcriptions associated with a file from the database.
@@ -81,24 +100,36 @@
Returns:
list: A list of lists containing transcription entries, where each inner list represents a file transcription and contains dictionaries with "start", "end", and "value" keys.
"""
self.cursor.execute("""
self.cursor.execute(
"""
SELECT id FROM file_transcription
-WHERE file = %s
-""",[file_id])
+WHERE file = %s
+""",
+[file_id],
+)
file_transcriptions = self.cursor.fetchall()
res = []
for file_transcription in file_transcriptions:
self.cursor.execute("""
self.cursor.execute(
"""
SELECT start, "end", value FROM transcription
-WHERE file_transcription = %s
-""",[file_transcription[0]])
+WHERE file_transcription = %s
+""",
+[file_transcription[0]],
+)
transcriptions = self.cursor.fetchall()
parsed_file_transcriptions = []
for transcription in transcriptions:
-parsed_file_transcriptions.append({"start":transcription[0],"end":transcription[1],"value":transcription[2]})
+parsed_file_transcriptions.append(
+{
+"start": transcription[0],
+"end": transcription[1],
+"value": transcription[2],
+}
+)
res.append(parsed_file_transcriptions)
return res
def close(self):
"""
Closes the database connection and cursor.
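For context, a minimal sketch of how this class might be exercised end to end; the credentials mirror the placeholders in `.variables.env` above, and the file ID is hypothetical:

```
# Sketch only: assumes `spectral.database` as the import path and a reachable Postgres.
from spectral.database import Database

db = Database(user="user", password="password", host="localhost", port=5432, dbname="spectral_db")
try:
    file = db.fetch_file(1)  # hypothetical file ID
    db.store_transcription(file["id"], [{"start": 0.0, "end": 0.4, "value": "hello"}])
    print(db.get_transcriptions(file["id"]))
finally:
    db.close()
```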

View file

@@ -1,6 +1,7 @@
import parselmouth
import numpy as np
def simple_frame_info(frame, fs, frame_info):
"""
Extracts and returns basic information from a given audio frame.
@@ -30,15 +31,15 @@ def simple_frame_info(frame, fs, frame_info):
"""
if frame_info is None:
return None
-data = frame[frame_info["startIndex"]:frame_info["endIndex"]]
+data = frame[frame_info["startIndex"] : frame_info["endIndex"]]
res = {}
res["duration"] = calculate_frame_duration(data,fs)
res["pitch"] = calculate_frame_pitch(data,fs)
formants = calculate_frame_f1_f2(data,fs)
res["duration"] = calculate_frame_duration(data, fs)
res["pitch"] = calculate_frame_pitch(data, fs)
formants = calculate_frame_f1_f2(data, fs)
res["f1"] = formants[0]
res["f2"] = formants[1]
return res
def calculate_frame_duration(frame, fs):
"""

View file

@@ -14,18 +14,18 @@ from .frame_analysis import (
calculate_frame_f1_f2,
)
from .mode_handler import (
-simple_info_mode,
-spectogram_mode,
+simple_info_mode,
+spectogram_mode,
vowel_space_mode,
-transcription_mode
-)
-from .transcription import (
-get_transcription
-)
+transcription_mode,
+)
+from .transcription import get_transcription
from .data_objects import Frame, Signal
from .database import Database
import orjson
import io
import os
from pydub import AudioSegment
@@ -154,10 +154,8 @@
try:
file = database.fetch_file(id)
except Exception as _:
-raise HTTPException(
-status_code=404, detail="File not found"
-)
+raise HTTPException(status_code=404, detail="File not found")
audio = AudioSegment.from_file(io.BytesIO(file["data"]))
fs = audio.frame_rate
data = audio.get_array_of_samples()
@@ -176,6 +174,7 @@
case _:
raise HTTPException(status_code=400, detail="Mode not found")
@app.get("/transcription/{model}/{id}")
async def transcribe_file(
model: Annotated[str, Path(title="The transcription model")],
@@ -193,7 +192,7 @@ async def transcribe_file(
- id (str): The ID of the file to transcribe.
Returns:
-- list: A list of dictionaires with keys 'start', 'end' and 'value' containing the transcription of the audio file.
+- list: A list of dictionaries with keys 'start', 'end' and 'value' containing the transcription of the audio file.
Raises:
- HTTPException: If the file is not found or an error occurs during transcription or storing the transcription.
@@ -201,15 +200,14 @@
try:
file = database.fetch_file(id)
except Exception as _:
-raise HTTPException(
-status_code=404, detail="File not found"
-)
-transcription = get_transcription(model,file)
+raise HTTPException(status_code=404, detail="File not found")
+transcription = get_transcription(model, file)
try:
-database.store_transcription(id,transcription)
+database.store_transcription(id, transcription)
except Exception as _:
raise HTTPException(
status_code=500, detail="Something went wrong while storing the transcription"
status_code=500,
detail="Something went wrong while storing the transcription",
)
return transcription
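The repository's tests (further down) drive this app through FastAPI's TestClient; a minimal sketch of hitting the transcription route shown above, where the module path and file ID are placeholders:

```
# Sketch only: assumes the app object lives at `spectral.main.app` and that a
# database connection and DG_KEY are configured in the environment.
from fastapi.testclient import TestClient
from spectral.main import app

client = TestClient(app)
resp = client.get("/transcription/deepgram/1")  # "deepgram" model, hypothetical file ID
print(resp.status_code, resp.json())
```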

View file

@@ -1,12 +1,10 @@
from fastapi import HTTPException
from .signal_analysis import simple_signal_info
-from .frame_analysis import (
-simple_frame_info,
-calculate_frame_f1_f2
-)
+from .frame_analysis import simple_frame_info, calculate_frame_f1_f2
-def simple_info_mode(data,fs,file,frame_index):
+def simple_info_mode(data, fs, file, frame_index):
"""
Extracts and returns basic information about a signal and its corresponding frame.
@@ -28,22 +26,22 @@ def simple_info_mode(data,fs,file,frame_index):
result = simple_info_mode(data, fs, file, frame_index)
```
"""
-result = simple_signal_info(data,fs)
+result = simple_signal_info(data, fs)
result["fileSize"] = len(file["data"])
result["fileCreationDate"] = file["creationTime"]
result["frame"] = simple_frame_info(data,fs,frame_index)
result["frame"] = simple_frame_info(data, fs, frame_index)
return result
-def spectogram_mode(data,fs,frame_index):
-""" TBD
+def spectogram_mode(data, fs, frame_index):
+"""TBD
Raises:
HTTPException: 501 not implemented
"""
-raise HTTPException(
-status_code=501, detail="spectogram_mode is not implemented"
-)
+raise HTTPException(status_code=501, detail="spectogram_mode is not implemented")
-def vowel_space_mode(data,fs,frame_index):
+def vowel_space_mode(data, fs, frame_index):
"""
Extracts and returns the first and second formants of a specified frame.
@@ -71,11 +69,12 @@ def vowel_space_mode(data,fs,frame_index):
raise HTTPException(
status_code=400, detail="Vowel-space mode was not given frame"
)
-frame_data = data[frame_index["startIndex"]:frame_index["endIndex"]]
-formants = calculate_frame_f1_f2(frame_data,fs)
-return {"f1":formants[0],"f2":formants[1]}
+frame_data = data[frame_index["startIndex"] : frame_index["endIndex"]]
+formants = calculate_frame_f1_f2(frame_data, fs)
+return {"f1": formants[0], "f2": formants[1]}
-def transcription_mode(id,database):
+def transcription_mode(id, database):
"""
Retrieve transcriptions of a file from the database.
@@ -95,7 +94,6 @@ def transcription_mode(id,database):
return database.get_transcriptions(id)
except Exception as _:
raise HTTPException(
status_code=500, detail="Something went wrong when retrieving the transcriptions of this file"
status_code=500,
detail="Something went wrong when retrieving the transcriptions of this file",
)
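As an illustration of the frame_index guard above, a hedged sketch of calling vowel_space_mode directly; the import path and signal data are assumptions:

```
# Sketch only: assumes `spectral.mode_handler` as the import path.
import numpy as np
from spectral.mode_handler import vowel_space_mode

fs = 44100
data = np.random.default_rng(0).normal(size=fs)  # placeholder signal, not real speech

# With frame_index=None this raises HTTPException(400); with a frame it returns formants.
print(vowel_space_mode(data, fs, {"startIndex": 0, "endIndex": 2048}))  # {"f1": ..., "f2": ...}
```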

View file

@@ -1,6 +1,7 @@
import parselmouth
import numpy as np
def simple_signal_info(signal, fs):
"""
Extracts and returns basic information from a given audio signal.
@@ -19,9 +20,12 @@ def simple_signal_info(signal, fs):
result = simple_signal_info(signal, fs)
```
"""
-duration = calculate_signal_duration(signal=signal,fs=fs)
-avg_pitch = np.mean(calculate_sound_pitch(signal_to_sound(signal=signal,fs=fs))["data"]).item()
-return {"duration":duration,"averagePitch":avg_pitch}
+duration = calculate_signal_duration(signal=signal, fs=fs)
+avg_pitch = np.mean(
+calculate_sound_pitch(signal_to_sound(signal=signal, fs=fs))["data"]  # type: ignore
+).item()
+return {"duration": duration, "averagePitch": avg_pitch}
def signal_features(signal, fs):
"""
@@ -52,7 +56,7 @@ def signal_features(signal, fs):
"spectogram": spectrogram,
"formants": formants,
}
def signal_to_sound(signal, fs):
"""

View file

@@ -2,6 +2,7 @@ from deepgram import DeepgramClient, PrerecordedOptions, FileSource
from fastapi import HTTPException
import os
def get_transcription(model, file):
"""
Get transcription of an audio file using the specified model.
@@ -22,10 +23,8 @@ def get_transcription(model, file):
case "deepgram":
return deepgram_transcription(file["data"])
case _:
-raise HTTPException(
-status_code=404, detail="Model was not found"
-)
+raise HTTPException(status_code=404, detail="Model was not found")
def deepgram_transcription(data):
"""
@@ -44,7 +43,12 @@ def deepgram_transcription(data):
"""
try:
# STEP 1: Create a Deepgram client using the API key
-deepgram = DeepgramClient(os.getenv("DG_KEY"))
+key = os.getenv("DG_KEY")
+deepgram = None
+if key is None:
+raise Exception("No API key for Deepgram is found")
+else:
+deepgram = DeepgramClient(key)
payload: FileSource = {
"buffer": data,
@@ -59,11 +63,13 @@
# STEP 3: Call the transcribe_file method with the text payload and options
response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
res = []
for word in response["results"]["channels"][0]["alternatives"][0]["words"]:
res.append({"value":word["word"],"start":word["start"],"end":word["end"]})
res.append(
{"value": word["word"], "start": word["start"], "end": word["end"]}
)
return res
except Exception as e:
print(f"Exception: {e}")
print(f"Exception: {e}")

File diff suppressed because one or more lines are too long

View file

@@ -8,11 +8,14 @@ import os
client = TestClient(app)
# Load the JSON file
-with open(os.path.join(os.path.realpath(__file__),"../data/frames.json"), "r") as file:
+with open(os.path.join(os.path.realpath(__file__), "../data/frames.json"), "r") as file:
frame_data = json.load(file)
typical_1_fs, typical_1_data = wv.read(
-os.path.join(os.path.realpath(__file__),"../data/torgo-dataset/MC02_control_head_sentence1.wav")
+os.path.join(
+os.path.realpath(__file__),
+"../data/torgo-dataset/MC02_control_head_sentence1.wav",
+)
)
typical_1_data = typical_1_data.tolist()

View file

@@ -9,7 +9,7 @@
import os
# Load the JSON file
-with open(os.path.join(os.path.realpath(__file__),"../data/frames.json"), "r") as file:
+with open(os.path.join(os.path.realpath(__file__), "../data/frames.json"), "r") as file:
frame_data = json.load(file)

View file

@@ -47,7 +47,7 @@ services:
env_file:
- '.variables.env'
- '.apikeys.env'
volumes:
postgres_data:
@@ -55,4 +55,3 @@
networks:
default:
external: false

View file

@@ -146,4 +146,4 @@ And, of course, we care about the project and building a good product. We are mo
4. Hard-working
5. Willingness to help others when in need
6. Good attitude toward project/teammates
-7. Willingness to propose novel, disrupting solutions
+7. Willingness to propose novel, disrupting solutions

View file

@@ -3,7 +3,7 @@
- **Type**: Choose Incident if its a bug, otherwise just a normal issue
- **Description**: Follow a template
- **Assignees**: assign yourself. if there is a person who is an expert in the topic, and will help you, you should also assign them
-- **Reviewers**: Assign two people. If there is a particular person that should review, assign them. Otherwise assign people pseudo-arbitrarily, while trying to keep it evenly distributed.
+- **Reviewers**: Assign two people. If there is a particular person that should review, assign them. Otherwise assign people pseudo-arbitrarily, while trying to keep it evenly distributed.
- **Milestone**: one of MVP (0.1), 1.0
- **Iteration**: Corresponds to each weekly sprint (from 1 to 10, deadlines are on Mondays)
- **Labels**: any number of
@@ -43,4 +43,4 @@
- Try to merge MRs in 3 days or less. The limit is 5 days to merge an MR, otherwise we will just close it, as we don't want to clutter the repo with stale MRs (barring exceptional cases, that should be justified). To facilitate this...
- Make MRs as small as possible and generally self-contained.
-- Squash commits if history is messy/too detailed.
+- Squash commits if history is messy/too detailed.

View file

@@ -3,7 +3,7 @@
A pane of analysis. TODO: Expand docs
-The analysis pane is responsible for fetching the data that the mode components need.
+The analysis pane is responsible for fetching the data that the mode components need.
-->
<script lang="ts">

View file

@@ -1,7 +1,7 @@
<!--
@component
-File explorer for the audio files.
+File explorer for the audio files.
-->
<script lang="ts">