Source code for coord2region.coord2study

"""Utilities for mapping coordinates to neuroimaging studies.

This module fetches, converts, and queries NiMARE-compatible datasets (e.g.,
Neurosynth, NeuroQuery, and NIDM-Pain) and assembles study metadata for
coordinates of interest.
"""

import logging
import os
import re
from typing import Any

import requests
from Bio import Entrez, Medline
from nimare.dataset import Dataset
from nimare.extract import download_nidm_pain, fetch_neuroquery, fetch_neurosynth
from nimare.io import convert_neurosynth_to_dataset
from nimare.utils import get_resource_path

from .utils import resolve_working_directory


[docs]
logger = logging.getLogger(__name__)



def _fetch_crossref_metadata(pmid: str) -> dict[str, str | None]:
    """Fetch title and abstract from CrossRef using a PMID.

    Parameters
    ----------
    pmid : str
        PubMed identifier for the study.

    Returns
    -------
    dict[str, Optional[str]]
        Dictionary containing ``title`` and ``abstract`` keys when available.
    """
    try:
        # Crossref does not support a 'pmid' filter; many records expose PubMed
        # IDs via 'alternative-id'. Use that when possible.
        headers = {"User-Agent": "coord2region (mailto:example@example.com)"}
        url = f"https://api.crossref.org/works?filter=alternative-id:{pmid}"
        resp = requests.get(url, headers=headers, timeout=5)
        resp.raise_for_status()
        data = resp.json().get("message", {}).get("items", [])
        if not data:
            return {}
        item = data[0]
        title_list = item.get("title", [])
        title = title_list[0] if title_list else None
        abstract = item.get("abstract")
        if abstract:
            # Remove simple XML/HTML tags
            abstract = re.sub("<[^>]+>", "", abstract).strip()
        return {"title": title, "abstract": abstract}
    except requests.HTTPError as exc:  # pragma: no cover - network errors
        # Downgrade common "bad request/not found" cases to info to reduce noise
        status = getattr(exc.response, "status_code", None)
        level = logger.info if status in (400, 404) else logger.warning
        level(f"Failed to fetch CrossRef metadata for PMID {pmid}: {exc}")
        return {}
    except Exception as exc:  # pragma: no cover - network errors
        logger.warning(f"Failed to fetch CrossRef metadata for PMID {pmid}: {exc}")
        return {}



[docs]
def fetch_datasets(
    data_dir: str, sources: list[str] | None = None
) -> dict[str, Dataset]:
    """Fetch and convert NiMARE datasets into ``Dataset`` objects.

    Parameters
    ----------
    data_dir : str
        Directory to store downloaded data.
    sources : Optional[List[str]], optional
        List of dataset names to download. Valid entries include ``"neurosynth"``,
        ``"neuroquery"`` and ``"nidm_pain"``. If ``None`` (default), all
        available datasets are fetched.

    Returns
    -------
    Dict[str, Dataset]
        Dictionary of NiMARE ``Dataset`` objects indexed by dataset name.
    """
    datasets: dict[str, Dataset] = {}
    os.makedirs(data_dir, exist_ok=True)

    # Determine which sources to fetch
    if sources is None:
        requested = {"neurosynth", "neuroquery", "nidm_pain"}
    else:
        requested = {src.lower() for src in sources}

    if "neurosynth" in requested:
        try:
            ns_files = fetch_neurosynth(
                data_dir=data_dir,
                version="7",
                source="abstract",
                vocab="terms",
                overwrite=False,
            )
            ns_data = ns_files[0]
            neurosynth_dset = convert_neurosynth_to_dataset(
                coordinates_file=ns_data["coordinates"],
                metadata_file=ns_data["metadata"],
                annotations_files=ns_data.get("features"),
            )
            datasets["Neurosynth"] = neurosynth_dset
            logger.info("Neurosynth dataset loaded successfully.")
        except Exception as e:  # pragma: no cover - network errors
            logger.warning(f"Failed to fetch/convert Neurosynth dataset: {e}")

    if "neuroquery" in requested:
        try:
            nq_files = fetch_neuroquery(
                data_dir=data_dir,
                version="1",
                source="combined",
                vocab="neuroquery6308",
                type="tfidf",
                overwrite=False,
            )
            nq_data = nq_files[0]
            neuroquery_dset = convert_neurosynth_to_dataset(
                coordinates_file=nq_data["coordinates"],
                metadata_file=nq_data["metadata"],
                annotations_files=nq_data.get("features"),
            )
            datasets["NeuroQuery"] = neuroquery_dset
            logger.info("NeuroQuery dataset loaded successfully.")
        except Exception as e:  # pragma: no cover - network errors
            logger.warning(f"Failed to fetch/convert NeuroQuery dataset: {e}")

    if "nidm_pain" in requested:
        if download_nidm_pain is not None and get_resource_path is not None:
            try:
                _ = download_nidm_pain(data_dir=data_dir, overwrite=False)
                dset_file = os.path.join(get_resource_path(), "nidm_pain_dset.json")
                nidm_pain_dset = Dataset(dset_file, target="mni152_2mm", mask=None)
                datasets["NIDM-Pain"] = nidm_pain_dset
                logger.info("NIDM-Pain dataset loaded successfully.")
            except Exception as e:  # pragma: no cover - network errors
                logger.warning(f"Failed to fetch/convert NIDM-Pain dataset: {e}")
        else:  # pragma: no cover - optional dependency
            logger.warning(
                "Skipping NIDM-Pain: required NiMARE helpers are not available."
            )

    if sources is not None:
        unknown = requested - {"neurosynth", "neuroquery", "nidm_pain"}
        for src in unknown:
            logger.warning(f"Unknown dataset source '{src}' requested; skipping.")

    if not datasets:
        logger.warning("No datasets were loaded.")
    return datasets




[docs]
def load_deduplicated_dataset(filepath: str) -> Dataset | None:
    """Load a previously saved deduplicated dataset.

    Parameters
    ----------
    filepath : str
        Path to the saved dataset file (``.pkl.gz``).

    Returns
    -------
    Optional[Dataset]
        The loaded ``Dataset`` object, or ``None`` if loading fails.
    """
    if not os.path.exists(filepath):
        logger.warning(f"Dataset file not found: {filepath}")
        return None

    try:
        dataset = Dataset.load(filepath, compressed=True)
        logger.info(
            "Loaded deduplicated dataset with %d studies from %s",
            len(dataset.ids),
            filepath,
        )
        return dataset
    except Exception as e:
        logger.error(f"Failed to load dataset from {filepath}: {e}")
        return None




[docs]
def deduplicate_datasets(
    datasets: dict[str, Dataset], save_dir: str | None = None
) -> Dataset | None:
    """Create a deduplicated dataset across sources using PMIDs.

    Duplicates are identified via PubMed IDs extracted from study identifiers.
    Datasets are merged sequentially using :meth:`~nimare.dataset.Dataset.merge`,
    introduced in NiMARE 0.0.9. Older NiMARE versions without ``merge`` will
    return the first dataset unchanged.

    Parameters
    ----------
    datasets : Dict[str, Dataset]
        Dictionary of NiMARE ``Dataset`` objects to deduplicate.
    save_dir : Optional[str], default=None
        Directory to save the deduplicated dataset (if provided).

    Returns
    -------
    Optional[Dataset]
        A deduplicated NiMARE ``Dataset`` combining all inputs, or ``None`` if
        no datasets were provided.
    """
    if not datasets:
        logger.warning("No datasets provided for deduplication.")
        return None

    dataset_list = list(datasets.values())
    tracked_pmids: set[str] = set()
    merged_dataset: Dataset | None = None

    for dset in dataset_list:
        ids_to_include: list[str] = []
        for sid in dset.ids:
            pmid = str(sid).split("-")[0]
            if pmid not in tracked_pmids:
                tracked_pmids.add(pmid)
                ids_to_include.append(sid)
        if not ids_to_include:
            continue
        subset = dset.slice(ids_to_include) if hasattr(dset, "slice") else dset
        if merged_dataset is None:
            merged_dataset = subset
        elif hasattr(merged_dataset, "merge"):
            merged_dataset = merged_dataset.merge(subset)
        else:  # pragma: no cover - merge method unavailable
            logger.warning("Dataset.merge not available; returning first dataset only.")
            break

    if merged_dataset is None:
        logger.warning("No studies remained after deduplication.")
        return None

    logger.info(f"Created deduplicated dataset with {len(merged_dataset.ids)} studies.")

    # Save the deduplicated dataset if requested
    if save_dir and isinstance(save_dir, str):
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, "deduplicated_dataset.pkl.gz")
        merged_dataset.save(save_path, compress=True)
        logger.info(f"Saved deduplicated dataset to {save_path}")

    return merged_dataset




[docs]
def prepare_datasets(
    data_dir: str | None = None,
    sources: list[str] | None = None,
) -> Dataset | None:
    """Load or create a deduplicated NiMARE dataset.

    Parameters
    ----------
    data_dir : str, optional
        Base directory for downloaded datasets and the merged cache. If ``None``
        (default) the path ``~/coord2region`` is used. Relative paths are
        interpreted relative to the user's home directory so that passing
        ``"my_cache"`` stores data in ``~/my_cache``.
    sources : Optional[List[str]], optional
        Dataset names to fetch if a cache needs to be built. See
        :func:`fetch_datasets` for valid entries. ``None`` (default) fetches all
        available datasets.

    Returns
    -------
    Optional[Dataset]
        The deduplicated NiMARE ``Dataset`` object, or ``None`` if preparation
        fails.
    """
    base_dir = resolve_working_directory(data_dir)
    base_dir.mkdir(parents=True, exist_ok=True)

    cache_dir = base_dir / "cached_data"
    cache_dir.mkdir(parents=True, exist_ok=True)

    dedup_path = cache_dir / "deduplicated_dataset.pkl.gz"

    if os.path.exists(dedup_path):
        dataset = load_deduplicated_dataset(str(dedup_path))
        if dataset is not None:
            return dataset

    datasets = fetch_datasets(str(base_dir), sources=sources)
    return deduplicate_datasets(datasets, save_dir=str(cache_dir))



def _extract_study_metadata(dset: Dataset, sid: Any) -> dict[str, Any]:
    """Extract title and abstract for a study.

    Parameters
    ----------
    dset : Dataset
        NiMARE ``Dataset`` containing the study.
    sid : Any
        Study identifier.

    Returns
    -------
    dict[str, Any]
        Dictionary with keys ``"id"``, ``"title"`` and, when available,
        ``"abstract"``. Metadata are drawn from NiMARE and supplemented with
        PubMed (via Biopython) or CrossRef when necessary.
    """
    study_entry: dict[str, Any] = {"id": str(sid)}

    title: str | None = None
    try:
        titles = dset.get_metadata(ids=[sid], field="title")
        if titles and titles[0] not in (None, "", "NaN"):
            title = titles[0]
    except Exception:
        pass

    # Fallback: try constructing a "title" from authors/year if possible
    if not title:
        try:
            authors = dset.get_metadata(ids=[sid], field="authors")
            year = dset.get_metadata(ids=[sid], field="year")
            if (
                authors
                and authors[0] not in (None, "", "NaN")
                and year
                and year[0] not in (None, "", "NaN")
            ):
                title = f"{authors[0]} ({year[0]})"
        except Exception:
            title = None

    if title:
        study_entry["title"] = title

    pmid = str(sid).split("-")[0]

    # Optionally retrieve abstract via Entrez if email provided and Bio available.
    if study_entry.get("id") and "email" in study_entry:
        try:
            handle = Entrez.efetch(
                db="pubmed", id=pmid, rettype="medline", retmode="text"
            )
            records = list(Medline.parse(handle))
            if records:
                rec = records[0]
                abstract_text = rec.get("AB")
                if abstract_text:
                    study_entry["abstract"] = abstract_text.strip()
                # Use PubMed title if we don't already have one
                if "title" not in study_entry:
                    pub_title = rec.get("TI")
                    if pub_title:
                        study_entry["title"] = pub_title.strip()
        except Exception as e:
            logger.warning(f"Failed to fetch abstract for PMID {pmid}: {e}")

    # Fallback to CrossRef to fill only missing fields
    needs_title = "title" not in study_entry or not study_entry.get("title")
    needs_abstract = "abstract" not in study_entry
    if needs_title or needs_abstract:
        crossref_data = _fetch_crossref_metadata(pmid)
        if needs_title and crossref_data.get("title"):
            study_entry["title"] = crossref_data["title"]
        if needs_abstract and crossref_data.get("abstract"):
            study_entry["abstract"] = crossref_data["abstract"]

    return study_entry



[docs]
def remove_duplicate_studies(studies: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Remove duplicate studies based on PMID.

    Parameters
    ----------
    studies : List[Dict[str, Any]]
        Study entries containing an ``"id"`` field with a ``PMID-`` prefix.

    Returns
    -------
    List[Dict[str, Any]]
        Unique study entries keyed by PMID.
    """
    unique: dict[str, dict[str, Any]] = {}
    for st in studies:
        full_id = st.get("id", "")
        pmid = full_id.split("-")[0]

        if pmid not in unique:
            unique[pmid] = st

    return list(unique.values())




[docs]
def search_studies(
    datasets: dict[str, Dataset],
    coord: list[float] | tuple[float, float, float],
    radius: float = 0,
    sources: list[str] | None = None,
    email: str | None = None,
) -> list[dict[str, Any]]:
    """Search selected datasets for studies reporting an MNI coordinate.

    Parameters
    ----------
    datasets : Dict[str, Dataset]
        NiMARE ``Dataset`` objects keyed by source name.
    coord : Union[List[float], Tuple[float, float, float]]
        MNI coordinate ``[x, y, z]``.
    radius : float, default=0
        Search radius in millimeters around the coordinate. ``0`` indicates an
        exact match.
    sources : Optional[List[str]], optional
        Specific dataset names to query. ``None`` searches across all provided
        datasets.
    email : Optional[str], optional
        Email address for Entrez (if abstract fetching is enabled).

    Returns
    -------
    List[Dict[str, Any]]
        Deduplicated list of study metadata dictionaries.
    """
    coord_list = [list(coord)]
    studies_info: list[dict[str, Any]] = []

    if sources is None:
        selected = datasets.items()
    else:
        selected = [(src, datasets[src]) for src in sources if src in datasets]

    for source, dset in selected:
        try:
            study_ids = dset.get_studies_by_coordinate(coord_list, r=radius)
        except Exception as e:  # pragma: no cover - underlying library errors
            logger.warning(
                f"Failed to search coordinate {coord} in {source} dataset: {e}"
            )
            continue

        if not study_ids:
            continue

        for sid in study_ids:
            study_entry = {"id": str(sid), "source": source}
            if email:
                Entrez.email = email
                study_entry["email"] = email

            study_metadata = _extract_study_metadata(dset, sid)
            study_entry.update(study_metadata)
            studies_info.append(study_entry)

    return remove_duplicate_studies(studies_info)




[docs]
def get_studies_for_coordinate(
    datasets: dict[str, Dataset] | Dataset,
    coord: list[float] | tuple[float, float, float],
    radius: float = 0,
    email: str | None = None,
    sources: list[str] | None = None,
) -> list[dict[str, Any]]:
    """Find studies reporting an MNI coordinate across all datasets.

    This is a thin wrapper around :func:`search_studies` that searches every
    dataset in ``datasets``. When a single deduplicated dataset is supplied, it
    is treated as a combined source.

    Parameters
    ----------
    datasets : Union[Dict[str, Dataset], Dataset]
        NiMARE ``Dataset`` objects keyed by source name, or a single
        deduplicated ``Dataset`` instance.
    coord : Union[List[float], Tuple[float, float, float]]
        MNI coordinate ``[x, y, z]``.
    radius : float, default=0
        Search radius in millimeters around the coordinate. ``0`` indicates an
        exact match.
    email : Optional[str], optional
        Email address for Entrez (if abstract fetching is enabled).
    sources : Optional[List[str]], optional
        Restrict the search to the specified dataset names when ``datasets`` is
        a mapping.
    """
    dataset_map: dict[str, Dataset]
    if isinstance(datasets, dict):
        dataset_map = datasets
    else:
        dataset_map = {"Combined": datasets}

    return search_studies(
        dataset_map, coord, radius=radius, email=email, sources=sources
    )