import base64
import io
import json
import logging
import re
import tarfile
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, Optional
import requests
import streamlit as st
from pages.utils.module_registry import get_all_modules
logger = logging.getLogger(__name__)
[docs]
def get_base64_image(path):
    """
    Read a file and return its content as a base64-encoded string.

    Parameters
    ----------
    path : str or path-like
        Location of the file to encode; it is opened in binary mode.

    Returns
    -------
    str
        Base64 representation of the file's bytes, decoded as UTF-8 text.
    """
    with open(path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
[docs]
def stat_box(title, value, icon_path, url=None):
    """
    Build the HTML snippet for a statistics card.

    The card shows an icon (embedded inline as a base64 PNG data URI), a title
    and a value. ``title``, ``value`` and ``url`` are interpolated into the
    markup as-is, without HTML escaping.

    Parameters
    ----------
    title : str
        Caption displayed under the icon.
    value : Any
        Figure displayed under the title.
    icon_path : str or path-like
        Path of the icon file to embed.
    url : str, optional
        When truthy, the card is wrapped in a link to this URL that opens in
        a new tab.

    Returns
    -------
    str
        The HTML markup of the card.
    """
    icon_b64 = get_base64_image(icon_path)
    card_html = f"""
<div style="
background: #fff;
border-radius: 12px;
box-shadow: 0 4px 16px rgba(0,0,0,0.06);
padding: 12px;
text-align: center;
min-height: 120px;
display: flex;
flex-direction: column;
justify-content: center;
align-items: center;
transition: transform 0.2s ease;
">
<div style="margin-bottom: 8px;">
<img src="data:image/png;base64,{icon_b64}" alt="icon" style="width: 36px; height: 36px;" />
</div>
<div style="color: #37475E; font-weight: 600; font-size: 0.95rem; text-align: center;">{title}</div>
<div style="font-size: 1.5rem; font-weight: 700; margin-top: 6px; color: #37475E;">{value}</div>
</div>
"""
    # Guard clause: without a link target the bare card is the result.
    if not url:
        return card_html
    return f"""<a href="{url}" target="_blank" style="text-decoration: none;">{card_html}</a>"""
[docs]
def get_n_modules():
    """
    Get the number of modules in ProteoBench.

    A module is counted for every ``*.py`` file located directly in the
    ``pages`` directory next to this file, except ``__init__.py`` and files
    whose name starts with ``base``.

    Returns
    -------
    int
        The number of modules.
    """
    pages_dir = Path(__file__).parent / "pages"
    module_count = 0
    for candidate in pages_dir.glob("*.py"):
        filename = candidate.name
        # Package marker and shared base pages are not modules themselves.
        if filename == "__init__.py" or filename.startswith("base"):
            continue
        module_count += 1
    return module_count
[docs]
def get_n_submitted_points(url: str = "https://proteobench.cubimed.rub.de/datasets/"):
    """
    Get the number of submitted points in ProteoBench.

    The page at ``url`` is scanned for links whose ``href`` ends with ``/``;
    each distinct one (other than the excluded entries) counts as one
    submitted point.

    Parameters
    ----------
    url : str
        URL of the page listing the dataset directories.

    Returns
    -------
    int or str
        The number of submitted points (excluding 'fasta/', 'raw_files/' and
        the parent link '../'), or the message
        "Error communicating with the server" when the request fails.
    """
    exclude_dirs = {"fasta/", "raw_files/", "../"}
    try:
        # Bounded wait so an unresponsive server cannot block the caller forever.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        logger.warning("Failed to retrieve submitted points from %s", url, exc_info=True)
        # Kept as a string for backward compatibility with existing callers.
        return "Error communicating with the server"

    # Find all hrefs that end in "/" and drop duplicates and unwanted entries.
    listed_dirs = re.findall(r'href="([^"]+/)"', response.text)
    return len(set(listed_dirs) - exclude_dirs)
# TODO: perhaps proposed modules should be parsed from the GitHub Discussions, but there doesn't seem to be an API endpoint for this.
[docs]
def parse_proteobench_index(rst_text: str) -> Dict[str, int]:
    """
    Parse the ProteoBench index.rst and count modules by status.

    Each module is assumed to start with '.. grid-item-card::'. Within a card,
    the first line that starts with ':bdg-' and contains a role body
    (``:`...```) provides the status; only the text between the backticks of
    that first badge is used, so a second badge or trailing text on the same
    line does not leak into the status. Cards without a badge are skipped.

    Parameters
    ----------
    rst_text : str
        The text content of the index.rst file.

    Returns
    -------
    Dict[str, int]
        Dictionary mapping lower-cased statuses to counts.
    """
    status_counter = Counter()
    # Everything before the first card marker is preamble and is ignored.
    cards = rst_text.split(".. grid-item-card::")[1:]
    for card in cards:
        for raw_line in card.splitlines():
            line = raw_line.strip()
            if not line.startswith(":bdg-"):
                continue
            # Example: :bdg-success:`active`
            _, opener, remainder = line.partition(":`")
            if not opener:
                continue
            # Keep only the text up to the closing backtick (or end of line).
            status = remainder.split("`", 1)[0].strip().lower()
            status_counter[status] += 1
            break  # Only the first badge of a card defines its status.
    return dict(status_counter)
[docs]
def get_n_modules_proposed(rst_text: str) -> int:
    """
    Compute the number of proposed modules, i.e. the sum of the modules
    whose status is 'in discussion' or 'in development'.

    Parameters
    ----------
    rst_text : str
        The text content of the index.rst file; it is parsed with
        parse_proteobench_index() to obtain the per-status counts.

    Returns
    -------
    int
        The total number of proposed modules.
    """
    counts_by_status = parse_proteobench_index(rst_text)
    proposed_statuses = ("in discussion", "in development")
    return sum(counts_by_status.get(status, 0) for status in proposed_statuses)
[docs]
def get_monthly_visits(api_endpoint: str, token: str, id_site: int) -> Optional[int]:
    """
    Get the monthly visits count from the Matomo API.

    The ``Actions.getPageTitles`` report is requested per day for the last
    30 days and the ``nb_visits`` of every returned page entry is summed.

    Parameters
    ----------
    api_endpoint : str
        The API endpoint URL of the Matomo installation
    token : str
        The authentication token (from Matomo)
    id_site : int
        The site ID (from Matomo)

    Returns
    -------
    Optional[int]
        The number of monthly visits (nb_visits of last 30 days), or
        ``None`` if retrieval/parsing failed or the API answered with an
        error or an unexpectedly shaped payload.
    """
    # data to be sent to api
    data = {
        "module": "API",
        "method": "Actions.getPageTitles",
        "idSite": id_site,
        "period": "day",
        "date": "last30",
        "format": "json",
        "token_auth": token,
    }
    try:
        r = requests.post(url=api_endpoint, data=data, timeout=10)
        r.raise_for_status()
        json_visits = json.loads(r.text)

        # A successful multi-day report is a JSON object keyed by date. Matomo
        # signals failures (e.g. a bad token) with an object carrying
        # "result": "error", which must not be iterated as report data.
        if not isinstance(json_visits, dict) or json_visits.get("result") == "error":
            logger.warning("Matomo API returned an error or unexpected response for monthly visits")
            return None

        visits_count = 0
        for pages in json_visits.values():
            # Days without traffic come back empty; ``or []`` also tolerates null.
            for page in pages or []:
                visits_count += page.get("nb_visits", 0)
        return visits_count
    except (requests.RequestException, json.JSONDecodeError, KeyError, AttributeError, TypeError):
        # AttributeError/TypeError cover payloads whose entries are not the
        # expected list-of-dicts, so callers reliably get ``None`` on failure.
        logger.warning("Failed to retrieve or parse monthly visits from Matomo API", exc_info=True)
        return None
[docs]
@st.cache_data(ttl=3600, show_spinner=False)
def get_module_submission_data() -> Dict[str, Dict[str, int]]:
    """
    Fetch submission data per module by downloading repo archives.

    For every module that declares a ``results_repo``, the ``main`` tarball of
    the corresponding ``Proteobench/<results_repo>`` GitHub repository is
    downloaded and each ``.json`` file in it is counted under its
    ``software_name`` ("Unknown" when the key is absent).
    Requests are made concurrently to minimize latency. The result is cached
    by Streamlit for one hour.

    Returns
    -------
    Dict[str, Dict[str, int]]
        Mapping of results_repo name to {software_name: count} dict. A repo
        whose archive could not be downloaded or read maps to an empty dict;
        individual malformed files are skipped with a warning.
    """
    modules_by_category = get_all_modules()

    headers = {}
    try:
        token = st.secrets["gh"]["token"]
        headers["Authorization"] = f"token {token}"
    except Exception:
        # Deliberately broad: any failure to read the secret falls back to
        # anonymous requests instead of breaking the page.
        logger.warning(
            "Could not obtain GitHub token, proceeding with unauthenticated requests which may be rate-limited."
        )

    repo_names = [
        module.results_repo for modules in modules_by_category.values() for module in modules if module.results_repo
    ]

    def _fetch_tool_breakdown(repo_name: str) -> tuple:
        """Download one repo archive and return ``(repo_name, {software_name: count})``."""
        try:
            url = f"https://api.github.com/repos/Proteobench/{repo_name}/tarball/main"
            resp = requests.get(url, headers=headers, timeout=30)
            resp.raise_for_status()
        except requests.RequestException:
            logger.warning("Failed to download archive for %s", repo_name, exc_info=True)
            return repo_name, {}

        tools = Counter()
        try:
            with tarfile.open(fileobj=io.BytesIO(resp.content), mode="r:gz") as tar:
                for member in tar.getmembers():
                    if not (member.name.endswith(".json") and member.isfile()):
                        continue
                    try:
                        extracted = tar.extractfile(member)
                        if extracted is None:
                            continue
                        with extracted:
                            data = json.loads(extracted.read())
                        tools[data.get("software_name", "Unknown")] += 1
                    except (
                        json.JSONDecodeError,
                        UnicodeDecodeError,  # bytes that are not valid UTF-8/16/32
                        AttributeError,  # top-level JSON value is not an object
                        TypeError,  # unhashable software_name (e.g. a list)
                        KeyError,
                        OSError,
                    ):
                        # One bad file must not abort the whole repo (or, via
                        # future.result(), the whole aggregation).
                        logger.warning("Skipping malformed file %s in %s", member.name, repo_name, exc_info=True)
        except (tarfile.TarError, OSError, EOFError):
            # OSError/EOFError cover corrupt or truncated gzip streams.
            logger.warning("Failed to read archive for %s", repo_name, exc_info=True)
            return repo_name, {}
        return repo_name, dict(tools)

    if not repo_names:
        return {}

    result: Dict[str, Dict[str, int]] = {}
    with ThreadPoolExecutor(max_workers=min(len(repo_names), 10)) as executor:
        futures = {executor.submit(_fetch_tool_breakdown, name): name for name in repo_names}
        for future in as_completed(futures):
            repo_name, tool_counts = future.result()
            result[repo_name] = tool_counts
    return result
if __name__ == "__main__":
    # This block is only for testing purposes
    print(f"Number of modules: {get_n_modules()}")
    print(f"Number of submitted points: {get_n_submitted_points()}")
    # NOTE: a call to ``get_n_supported_tools()`` used to be printed here, but no
    # function of that name is defined or imported in this module, so running
    # the script stopped with a NameError at that point.
    file_path = Path(__file__).parent.parent / "docs" / "index.rst"
    # Read the index once and reuse the text.
    index_text = file_path.read_text(encoding="utf-8")
    print(f"Number of proposed modules: {get_n_modules_proposed(index_text)}")