initial import and jupyter notebook to check for data issues

2026-06-01 17:34:41 +01:00
parent ff48d58018
commit 326c86b9a4
5 changed files with 208 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,16 @@
 # ---> Python
 # Data generated
 data/*.db
 *.xlsx
 # Ignore local virtual environments
 venv/
 .venv/
 # Ignore Jupyter Notebook checkpoints
 .ipynb_checkpoints/
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
--- a/notebooks/001_data_cleaning_checks.ipynb
+++ b/notebooks/001_data_cleaning_checks.ipynb
@@ -0,0 +1,45 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d393e6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "import sqlite3\n",
    "import pandas as pd\n",
    "\n",
    "# 1. Resolve repository pathing and connect to SQLite\n",
    "BASE_DIR = os.path.dirname(os.getcwd())\n",
    "DB_PATH = os.path.join(BASE_DIR, \"data\", \"met_office_weather.db\")\n",
    "conn = sqlite3.connect(DB_PATH)\n",
    "\n",
    "# 2. Extract dataset profile\n",
    "df = pd.read_sql_query(\"SELECT * FROM historic_weather\", conn)\n",
    "print(\"--- Dataset Shape ---\")\n",
    "print(f\"Total Rows: {df.shape[0]}, Total Columns: {df.shape[1]}\\n\")\n",
    "\n",
    "print(\"--- Column Data Types & Counts ---\")\n",
    "df.info()\n",
    "\n",
    "print(\"\\n--- Missing Values (NaN) Per Feature Column ---\")\n",
    "print(df.isnull().sum())\n",
    "\n",
    "print(\"\\n --- Total Row Logs Collected Per Unique Station ---\")\n",
    "print(df[\"station_name\"].value_counts())\n",
    "\n",
    "conn.close()"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,9 @@
 requests==2.31.0
 beautifulsoup4==4.12.3
 pandas==2.2.3
 openpyxl==3.1.5
 numpy==2.2.3
 scipy==1.17.1
 matplotlib==3.10.0
 seaborn==0.13.2
 statsmodels==0.14.4
--- a/src/init.py
+++ b/src/init.py
--- a/src/ingest_data.py
+++ b/src/ingest_data.py
@@ -0,0 +1,142 @@
 import io
 import os
 import re
 import sqlite3
 from urllib.parse import urljoin
 import pandas as pd
 import requests
 from bs4 import BeautifulSoup
 # Paths are anchored on the parent of this script's directory (the repository
 # root), so they resolve identically from any working directory.
 _SRC_DIR = os.path.dirname(os.path.abspath(__file__))
 BASE_DIR = os.path.dirname(_SRC_DIR)
 DB_PATH = os.path.join(BASE_DIR, "data", "met_office_weather.db")
 def setup_database():
    """Open the SQLite database, creating its parent directory if needed.

    Returns:
        An open ``sqlite3.Connection`` to ``DB_PATH``; the caller closes it.
    """
    data_dir = os.path.dirname(DB_PATH)
    # exist_ok makes this a no-op when the directory is already there.
    os.makedirs(data_dir, exist_ok=True)
    connection = sqlite3.connect(DB_PATH)
    return connection
 def fetch_station_urls():
    """Scrape the Met Office landing page for historic station data links.

    Every anchor whose ``href`` ends in ``data.txt`` is collected and resolved
    to an absolute URL against the landing page address.

    Returns:
        A sorted list of unique absolute URLs, one per station data file.

    Raises:
        requests.RequestException: If the landing page cannot be fetched
            (connection error, timeout or an error HTTP status).
    """
    main_url = "https://www.metoffice.gov.uk/research/climate/maps-and-data/historic-station-data"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    print("🔍 Fetching station links from the Met Office...")
    # Without a timeout a stalled connection would block the script forever;
    # 15s is the same limit main() uses for the per-station downloads.
    response = requests.get(main_url, headers=headers, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # urljoin leaves absolute links untouched and resolves root-relative,
    # page-relative and protocol-relative hrefs correctly.
    links = {
        urljoin(main_url, str(anchor["href"]))
        for anchor in soup.find_all("a", href=True)
        if str(anchor["href"]).endswith("data.txt")
    }
    print(f"✅ Found {len(links)} weather stations.")
    # Sorted so the processing order is reproducible between runs.
    return sorted(links)
 def clean_and_parse_data(url, raw_text):
    """Parse one Met Office station text file into a numeric DataFrame.

    The first line of the text is taken as the station name. Everything up to
    the column header row (the first line containing both ``yyyy`` and ``mm``)
    and the single row directly beneath it is discarded. The ``*`` and ``#``
    characters and the ``---`` placeholder are stripped from every value
    before it is converted to a number; anything non-numeric becomes NaN.

    Args:
        url: Source URL of the file. Not used by the parser itself.
        raw_text: Full text content of the station data file.

    Returns:
        A DataFrame holding the sanitised header columns as numeric data
        (``yyyy`` and ``mm`` as integers) plus a ``station_name`` column.
        Rows without a numeric ``yyyy`` and ``mm`` are dropped.

    Raises:
        ValueError: If no header row containing ``yyyy`` and ``mm`` is found.
    """
    # splitlines() also copes with "\r\n" line endings.
    lines = raw_text.splitlines()
    station_name = lines[0].strip() if lines else ""
    header_idx = next(
        (idx for idx, line in enumerate(lines) if "yyyy" in line and "mm" in line),
        None,
    )
    if header_idx is None:
        raise ValueError("Could not find a valid header row containing 'yyyy mm'")
    # The header tokens fix the expected column count for every data row.
    header_tokens = lines[header_idx].split()
    headers_list = [re.sub(r"[^a-zA-Z0-9]", "", col) for col in header_tokens]
    expected_col_count = len(headers_list)
    # Rebuild each row with single-space separators, slicing off any stray
    # trailing tokens beyond the header's column count.
    cleaned_rows = [" ".join(header_tokens)]
    # header_idx + 1 is the units row (e.g. "degC"), so data starts at + 2.
    for line in lines[header_idx + 2:]:
        tokens = line.split()
        if not tokens:
            continue
        cleaned_rows.append(" ".join(tokens[:expected_col_count]))
    df = pd.read_csv(io.StringIO("\n".join(cleaned_rows)), sep=r"\s+")
    # Standardise column headers (special characters removed).
    df.columns = headers_list
    for col in headers_list:
        cleaned = (
            df[col].astype(str)
            .str.replace("*", "", regex=False)
            .str.replace("#", "", regex=False)
            .str.replace("---", "", regex=False)
        )
        # Assign the column itself rather than through df.loc[:, col]: the
        # .loc form writes into the existing column in place, which leaves
        # an object-dtype column as object. Empty strings become NaN.
        df[col] = pd.to_numeric(cleaned, errors="coerce")
    # Identity column, taken from the first line of the file.
    df["station_name"] = station_name
    # Rows without a numeric year and month are not dated records.
    df = df.dropna(subset=["yyyy", "mm"]).astype({"yyyy": int, "mm": int})
    return df
 def main():
    """Rebuild the ``historic_weather`` table from the Met Office station files.

    Station links are scraped first; the existing table is then dropped and
    each station file is downloaded, parsed and appended. A failure on one
    station is reported and skipped so the remaining stations still load.
    If no links are found the existing table is left as it is.
    """
    conn = setup_database()
    try:
        urls = fetch_station_urls()
        if not urls:
            # Keep whatever was loaded previously rather than dropping the
            # table and replacing it with nothing.
            print("❌ No station links found; existing data left untouched.")
            return
        # Clear out any historical records to prevent compounding data duplications
        conn.execute("DROP TABLE IF EXISTS historic_weather")
        conn.commit()
        success_count = 0
        for url in urls:
            try:
                res = requests.get(url, timeout=15)
                res.raise_for_status()
                df = clean_and_parse_data(url, res.text)
                if df.empty:
                    # Nothing to store; report it instead of counting it as loaded.
                    raise ValueError("no data rows were parsed")
                df.to_sql("historic_weather", conn, if_exists="append", index=False)
                success_count += 1
                print(f"Successfully processed {df['station_name'].iloc[0]}")
            except Exception as e:
                # Per-station boundary: report the problem and carry on.
                print(f"❌ Failed to process link {url}. Error: {e}")
    finally:
        # Release the database handle even if scraping or loading raised.
        conn.close()
    print(f"\n Pipelines complete! Loaded {success_count}/{len(urls)} stations into '{DB_PATH}'.")
 if __name__ == "__main__":
    main()