finemo.main
Main CLI module for the Fi-NeMo motif instance calling pipeline.
This module provides the command-line interface for all Fi-NeMo operations:
- Data preprocessing from various genomic formats
- Motif hit calling using the Fi-NeMo algorithm
- Report generation and result visualization
- Post-processing operations (hit collapsing, intersection)
The CLI supports multiple input formats including bigWig, HDF5 (ChromBPNet/BPNet), and TF-MoDISco format.
1"""Main CLI module for the Fi-NeMo motif instance calling pipeline. 2 3This module provides the command-line interface for all Fi-NeMo operations: 4- Data preprocessing from various genomic formats 5- Motif hit calling using the Fi-NeMo algorithm 6- Report generation and result visualization 7- Post-processing operations (hit collapsing, intersection) 8 9The CLI supports multiple input formats including bigWig, HDF5 (ChromBPNet/BPNet), 10and TF-MoDISco format. 11""" 12 13from . import data_io 14 15import os 16import argparse 17import warnings 18import inspect 19from typing import Optional, List 20 21import polars as pl 22 23 24def extract_regions_bw( 25 peaks_path: str, 26 chrom_order_path: Optional[str], 27 fa_path: str, 28 bw_paths: List[str], 29 out_path: str, 30 region_width: int = 1000, 31) -> None: 32 """Extract genomic regions and contribution scores from bigWig and FASTA files. 33 34 Parameters 35 ---------- 36 peaks_path : str 37 Path to ENCODE NarrowPeak format file. 38 chrom_order_path : str, optional 39 Path to chromosome ordering file. 40 fa_path : str 41 Path to genome FASTA file. 42 bw_paths : List[str] 43 List of bigWig file paths containing contribution scores. 44 out_path : str 45 Output path for NPZ file. 46 region_width : int, default 1000 47 Width of regions to extract around peak summits. 48 49 Notes 50 ----- 51 BigWig files only provide projected contribution scores. 52 """ 53 half_width = region_width // 2 54 55 # Load peak regions and extract sequences/contributions 56 peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width) 57 sequences, contribs = data_io.load_regions_from_bw( 58 peaks_df, fa_path, bw_paths, half_width 59 ) 60 61 # Save processed data to NPZ format 62 data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df) 63 64 65def extract_regions_chrombpnet_h5( 66 peaks_path: Optional[str], 67 chrom_order_path: Optional[str], 68 h5_paths: List[str], 69 out_path: str, 70 region_width: int = 1000, 71) -> None: 72 """Extract genomic regions and contribution scores from ChromBPNet HDF5 files. 73 74 Parameters 75 ---------- 76 peaks_path : str, optional 77 Path to ENCODE NarrowPeak format file. If None, lacks absolute coordinates. 78 chrom_order_path : str, optional 79 Path to chromosome ordering file. 80 h5_paths : List[str] 81 List of ChromBPNet HDF5 file paths. 82 out_path : str 83 Output path for NPZ file. 84 region_width : int, default 1000 85 Width of regions to extract around peak summits. 86 """ 87 half_width = region_width // 2 88 89 if peaks_path is not None: 90 peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width) 91 else: 92 peaks_df = None 93 94 sequences, contribs = data_io.load_regions_from_chrombpnet_h5(h5_paths, half_width) 95 96 data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df) 97 98 99def extract_regions_bpnet_h5( 100 peaks_path: Optional[str], 101 chrom_order_path: Optional[str], 102 h5_paths: List[str], 103 out_path: str, 104 region_width: int = 1000, 105) -> None: 106 """Extract genomic regions and contribution scores from BPNet HDF5 files. 107 108 Parameters 109 ---------- 110 peaks_path : str, optional 111 Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates. 112 chrom_order_path : str, optional 113 Path to chromosome ordering file. 114 h5_paths : List[str] 115 List of BPNet HDF5 file paths. 116 out_path : str 117 Output path for NPZ file. 118 region_width : int, default 1000 119 Width of regions to extract around peak summits. 120 """ 121 half_width = region_width // 2 122 123 if peaks_path is not None: 124 peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width) 125 else: 126 peaks_df = None 127 128 sequences, contribs = data_io.load_regions_from_bpnet_h5(h5_paths, half_width) 129 130 data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df) 131 132 133def extract_regions_modisco_fmt( 134 peaks_path: Optional[str], 135 chrom_order_path: Optional[str], 136 shaps_paths: List[str], 137 ohe_path: str, 138 out_path: str, 139 region_width: int = 1000, 140) -> None: 141 """Extract genomic regions and contribution scores from TF-MoDISco format files. 142 143 Parameters 144 ---------- 145 peaks_path : str, optional 146 Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates. 147 chrom_order_path : str, optional 148 Path to chromosome ordering file. 149 shaps_paths : List[str] 150 List of paths to .npy/.npz files containing SHAP/attribution scores. 151 ohe_path : str 152 Path to .npy/.npz file containing one-hot encoded sequences. 153 out_path : str 154 Output path for NPZ file. 155 region_width : int, default 1000 156 Width of regions to extract around peak summits. 157 """ 158 half_width = region_width // 2 159 160 if peaks_path is not None: 161 peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width) 162 else: 163 peaks_df = None 164 165 sequences, contribs = data_io.load_regions_from_modisco_fmt( 166 shaps_paths, ohe_path, half_width 167 ) 168 169 data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df) 170 171 172def call_hits( 173 regions_path: str, 174 peaks_path: Optional[str], 175 modisco_h5_path: str, 176 chrom_order_path: Optional[str], 177 motifs_include_path: Optional[str], 178 motif_names_path: Optional[str], 179 motif_lambdas_path: Optional[str], 180 out_dir: str, 181 cwm_trim_coords_path: Optional[str] = None, 182 cwm_trim_thresholds_path: Optional[str] = None, 183 cwm_trim_threshold_default: float = 0.3, 184 lambda_default: float = 0.7, 185 step_size_max: float = 3.0, 186 step_size_min: float = 0.08, 187 sqrt_transform: bool = False, 188 convergence_tol: float = 0.0005, 189 max_steps: int = 10000, 190 batch_size: int = 2000, 191 step_adjust: float = 0.7, 192 device: Optional[str] = None, 193 mode: str = "pp", 194 no_post_filter: bool = False, 195 compile_optimizer: bool = False, 196) -> None: 197 """Call motif hits using the Fi-NeMo algorithm on preprocessed genomic regions. 198 199 This function implements the core Fi-NeMo hit calling pipeline, which identifies 200 motif instances by solving a sparse reconstruction problem using proximal gradient 201 descent. The algorithm represents contribution scores as weighted combinations of 202 motif CWMs at specific positions. 203 204 Parameters 205 ---------- 206 regions_path : str 207 Path to NPZ file containing preprocessed regions (sequences, contributions, 208 and optional peak coordinates). 209 peaks_path : str, optional 210 DEPRECATED. Path to ENCODE NarrowPeak format file. Peak data should be 211 included during preprocessing instead. 212 modisco_h5_path : str 213 Path to TF-MoDISco H5 file containing motif CWMs. 214 chrom_order_path : str, optional 215 DEPRECATED. Path to chromosome ordering file. 216 motifs_include_path : str, optional 217 Path to file listing motif names to include in analysis. 218 motif_names_path : str, optional 219 Path to file mapping motif IDs to custom names. 220 motif_lambdas_path : str, optional 221 Path to file specifying per-motif lambda values. 222 out_dir : str 223 Output directory for results. 224 cwm_trim_coords_path : str, optional 225 Path to file specifying custom motif trimming coordinates. 226 cwm_trim_thresholds_path : str, optional 227 Path to file specifying custom motif trimming thresholds. 228 cwm_trim_threshold_default : float, default 0.3 229 Default threshold for motif trimming. 230 lambda_default : float, default 0.7 231 Default L1 regularization weight. 232 step_size_max : float, default 3.0 233 Maximum optimization step size. 234 step_size_min : float, default 0.08 235 Minimum optimization step size. 236 sqrt_transform : bool, default False 237 Whether to apply signed square root transform to contributions. 238 convergence_tol : float, default 0.0005 239 Convergence tolerance for duality gap. 240 max_steps : int, default 10000 241 Maximum number of optimization steps. 242 batch_size : int, default 2000 243 Batch size for GPU processing. 244 step_adjust : float, default 0.7 245 Step size adjustment factor on divergence. 246 device : str, optional 247 DEPRECATED. Use CUDA_VISIBLE_DEVICES environment variable instead. 248 mode : str, default "pp" 249 Contribution type mode ('pp', 'ph', 'hp', 'hh') where 'p'=projected, 'h'=hypothetical. 250 no_post_filter : bool, default False 251 If True, skip post-hit-calling similarity filtering. 252 compile_optimizer : bool, default False 253 Whether to JIT-compile the optimizer for speed. 254 255 Notes 256 ----- 257 The Fi-NeMo algorithm solves the optimization problem: 258 minimize_c: ||contribs - reconstruction(c)||²₂ + λ||c||₁ 259 subject to: c ≥ 0 260 261 where c represents motif hit coefficients and reconstruction uses convolution 262 with motif CWMs. 263 """ 264 265 params = locals() 266 import torch 267 from . import hitcaller 268 269 if device is not None: 270 warnings.warn( 271 "The `--device` flag is deprecated and will be removed in a future version. Please use the `CUDA_VISIBLE_DEVICES` environment variable to specify the GPU device." 272 ) 273 274 sequences, contribs, peaks_df, has_peaks = data_io.load_regions_npz(regions_path) 275 276 region_width = sequences.shape[2] 277 if region_width % 2 != 0: 278 raise ValueError(f"Region width of {region_width} is not divisible by 2.") 279 280 half_width = region_width // 2 281 num_regions = contribs.shape[0] 282 283 if peaks_path is not None: 284 warnings.warn( 285 "Providing a peaks file to `call-hits` is deprecated, and this option will be removed in a future version. Peaks should instead be provided in the preprocessing step to be included in `regions.npz`." 286 ) 287 peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width) 288 has_peaks = True 289 290 if not has_peaks: 291 warnings.warn( 292 "No peak region data provided. Output hits will lack absolute genomic coordinates." 293 ) 294 295 if mode == "pp": 296 motif_type = "cwm" 297 use_hypothetical_contribs = False 298 elif mode == "ph": 299 motif_type = "cwm" 300 use_hypothetical_contribs = True 301 elif mode == "hp": 302 motif_type = "hcwm" 303 use_hypothetical_contribs = False 304 elif mode == "hh": 305 motif_type = "hcwm" 306 use_hypothetical_contribs = True 307 else: 308 raise ValueError( 309 f"Invalid mode: {mode}. Must be one of 'pp', 'ph', 'hp', 'hh'." 310 ) 311 312 if motifs_include_path is not None: 313 motifs_include = data_io.load_txt(motifs_include_path) 314 else: 315 motifs_include = None 316 317 if motif_names_path is not None: 318 motif_name_map = data_io.load_mapping(motif_names_path, str) 319 else: 320 motif_name_map = None 321 322 if motif_lambdas_path is not None: 323 motif_lambdas = data_io.load_mapping(motif_lambdas_path, float) 324 else: 325 motif_lambdas = None 326 327 if cwm_trim_coords_path is not None: 328 trim_coords = data_io.load_mapping_tuple(cwm_trim_coords_path, int) 329 else: 330 trim_coords = None 331 332 if cwm_trim_thresholds_path is not None: 333 trim_thresholds = data_io.load_mapping(cwm_trim_thresholds_path, float) 334 else: 335 trim_thresholds = None 336 337 motifs_df, cwms, trim_masks, _ = data_io.load_modisco_motifs( 338 modisco_h5_path, 339 trim_coords, 340 trim_thresholds, 341 cwm_trim_threshold_default, 342 motif_type, 343 motifs_include, 344 motif_name_map, 345 motif_lambdas, 346 lambda_default, 347 True, 348 ) 349 num_motifs = cwms.shape[0] 350 motif_width = cwms.shape[2] 351 lambdas = motifs_df.get_column("lambda").to_numpy(writable=True) 352 353 device_obj = torch.device(device) if device is not None else None 354 hits_df, qc_df = hitcaller.fit_contribs( 355 cwms, 356 contribs, 357 sequences, 358 trim_masks, 359 use_hypothetical_contribs, 360 lambdas, 361 step_size_max, 362 step_size_min, 363 sqrt_transform, 364 convergence_tol, 365 max_steps, 366 batch_size, 367 step_adjust, 368 not no_post_filter, 369 device_obj, 370 compile_optimizer, 371 ) 372 373 os.makedirs(out_dir, exist_ok=True) 374 out_path_qc = os.path.join(out_dir, "peaks_qc.tsv") 375 data_io.write_hits(hits_df, peaks_df, motifs_df, qc_df, out_dir, motif_width) 376 data_io.write_qc(qc_df, peaks_df, out_path_qc) 377 378 out_path_motif_df = os.path.join(out_dir, "motif_data.tsv") 379 data_io.write_motifs_df(motifs_df, out_path_motif_df) 380 381 out_path_motif_cwms = os.path.join(out_dir, "motif_cwms.npy") 382 data_io.write_motif_cwms(cwms, out_path_motif_cwms) 383 384 params |= { 385 "region_width": region_width, 386 "num_regions": num_regions, 387 "untrimmed_motif_width": motif_width, 388 "num_motifs": num_motifs, 389 } 390 391 out_path_params = os.path.join(out_dir, "parameters.json") 392 data_io.write_params(params, out_path_params) 393 394 395def report( 396 regions_path: str, 397 hits_dir: str, 398 modisco_h5_path: Optional[str], 399 peaks_path: Optional[str], 400 motifs_include_path: Optional[str], 401 motif_names_path: Optional[str], 402 out_dir: str, 403 modisco_region_width: int = 400, 404 cwm_trim_threshold: float = 0.3, 405 compute_recall: bool = True, 406 use_seqlets: bool = True, 407) -> None: 408 """Generate comprehensive HTML report with statistics and visualizations. 409 410 This function creates detailed analysis reports comparing Fi-NeMo hit calling 411 results with TF-MoDISco seqlets, including performance metrics, distribution 412 plots, and motif visualization. The report provides insights into hit calling 413 quality and motif discovery accuracy. 414 415 Parameters 416 ---------- 417 regions_path : str 418 Path to NPZ file containing the same regions used for hit calling. 419 hits_dir : str 420 Path to directory containing Fi-NeMo hit calling outputs. 421 modisco_h5_path : str, optional 422 Path to TF-MoDISco H5 file. If None, seqlet comparisons are skipped. 423 peaks_path : str, optional 424 DEPRECATED. Peak coordinates should be included in regions file. 425 motifs_include_path : str, optional 426 DEPRECATED. This information is inferred from hit calling outputs. 427 motif_names_path : str, optional 428 DEPRECATED. This information is inferred from hit calling outputs. 429 out_dir : str 430 Output directory for report files. 431 modisco_region_width : int, default 400 432 Width of regions used by TF-MoDISco (needed for coordinate conversion). 433 cwm_trim_threshold : float, default 0.3 434 DEPRECATED. This information is inferred from hit calling outputs. 435 compute_recall : bool, default True 436 Whether to compute recall metrics against TF-MoDISco seqlets. 437 use_seqlets : bool, default True 438 Whether to include seqlet-based comparisons in the report. 439 440 Notes 441 ----- 442 The generated report includes: 443 - Hit vs seqlet count comparisons 444 - Motif CWM visualizations 445 - Hit statistic distributions 446 - Co-occurrence heatmaps 447 - Confusion matrices for overlapping motifs 448 """ 449 from . import evaluation, visualization 450 451 sequences, contribs, peaks_df, _ = data_io.load_regions_npz(regions_path) 452 if len(contribs.shape) == 3: 453 regions = contribs * sequences 454 elif len(contribs.shape) == 2: 455 regions = contribs[:, None, :] * sequences 456 else: 457 raise ValueError(f"Unexpected contribs shape: {contribs.shape}") 458 459 half_width = regions.shape[2] // 2 460 modisco_half_width = modisco_region_width // 2 461 462 if peaks_path is not None: 463 warnings.warn( 464 "Providing a peaks file to `report` is deprecated, and this option will be removed in a future version. Peaks should instead be provided in the preprocessing step to be included in `regions.npz`." 465 ) 466 peaks_df = data_io.load_peaks(peaks_path, None, half_width) 467 468 if hits_dir.endswith(".tsv"): 469 warnings.warn( 470 "Passing a hits.tsv file to `finemo report` is deprecated. Please provide the directory containing the hits.tsv file instead." 471 ) 472 473 hits_path = hits_dir 474 475 hits_df = data_io.load_hits(hits_path, lazy=True) 476 477 if motifs_include_path is not None: 478 motifs_include = data_io.load_txt(motifs_include_path) 479 else: 480 motifs_include = None 481 482 if motif_names_path is not None: 483 motif_name_map = data_io.load_mapping(motif_names_path, str) 484 else: 485 motif_name_map = None 486 487 if modisco_h5_path is not None: 488 motifs_df, cwms_modisco, _, motif_names = data_io.load_modisco_motifs( 489 modisco_h5_path, 490 None, 491 None, 492 cwm_trim_threshold, 493 "cwm", 494 motifs_include, 495 motif_name_map, 496 None, 497 1.0, 498 True, 499 ) 500 else: 501 # When no modisco_h5_path is provided in legacy TSV mode, we can't compute motifs 502 # This will cause an error later, but that's expected behavior 503 raise ValueError( 504 "modisco_h5_path is required when providing a hits.tsv file directly" 505 ) 506 507 else: 508 hits_df_path = os.path.join(hits_dir, "hits.tsv") 509 hits_df = data_io.load_hits(hits_df_path, lazy=True) 510 511 motifs_df_path = os.path.join(hits_dir, "motif_data.tsv") 512 motifs_df, motif_names = data_io.load_motifs_df(motifs_df_path) 513 514 cwms_modisco_path = os.path.join(hits_dir, "motif_cwms.npy") 515 cwms_modisco = data_io.load_motif_cwms(cwms_modisco_path) 516 517 params_path = os.path.join(hits_dir, "parameters.json") 518 params = data_io.load_params(params_path) 519 cwm_trim_threshold = params["cwm_trim_threshold_default"] 520 521 if not use_seqlets: 522 warnings.warn( 523 "Usage of the `--no-seqlets` flag is deprecated and will be removed in a future version. Please omit the `--modisco-h5` argument instead." 524 ) 525 seqlets_df = None 526 elif modisco_h5_path is None: 527 compute_recall = False 528 seqlets_df = None 529 else: 530 seqlets_df = data_io.load_modisco_seqlets( 531 modisco_h5_path, 532 peaks_df, 533 motifs_df, 534 half_width, 535 modisco_half_width, 536 lazy=True, 537 ) 538 539 motif_width = cwms_modisco.shape[2] 540 541 # Convert to LazyFrame if needed and ensure motif_names is a list 542 if isinstance(hits_df, pl.LazyFrame): 543 hits_df_lazy: pl.LazyFrame = hits_df 544 else: 545 hits_df_lazy: pl.LazyFrame = hits_df.lazy() 546 547 motif_names_list: List[str] = list(motif_names) 548 549 occ_df, coooc = evaluation.get_motif_occurences(hits_df_lazy, motif_names_list) 550 551 report_data, report_df, cwms, trim_bounds = evaluation.tfmodisco_comparison( 552 regions, 553 hits_df, 554 peaks_df, 555 seqlets_df, 556 motifs_df, 557 cwms_modisco, 558 motif_names_list, 559 modisco_half_width, 560 motif_width, 561 compute_recall, 562 ) 563 564 if seqlets_df is not None: 565 confusion_df, confusion_mat = evaluation.seqlet_confusion( 566 hits_df, seqlets_df, peaks_df, motif_names_list, motif_width 567 ) 568 else: 569 confusion_df, confusion_mat = None, None 570 571 os.makedirs(out_dir, exist_ok=True) 572 573 occ_path = os.path.join(out_dir, "motif_occurrences.tsv") 574 data_io.write_occ_df(occ_df, occ_path) 575 576 data_io.write_report_data(report_df, cwms, out_dir) 577 578 visualization.plot_hit_stat_distributions(hits_df_lazy, motif_names_list, out_dir) 579 visualization.plot_hit_peak_distributions(occ_df, motif_names_list, out_dir) 580 visualization.plot_peak_motif_indicator_heatmap(coooc, motif_names_list, out_dir) 581 582 plot_dir = os.path.join(out_dir, "CWMs") 583 visualization.plot_cwms(cwms, trim_bounds, plot_dir) 584 585 if seqlets_df is not None: 586 seqlets_collected = ( 587 seqlets_df.collect() if isinstance(seqlets_df, pl.LazyFrame) else seqlets_df 588 ) 589 seqlets_path = os.path.join(out_dir, "seqlets.tsv") 590 data_io.write_modisco_seqlets(seqlets_collected, seqlets_path) 591 592 if confusion_df is not None and confusion_mat is not None: 593 seqlet_confusion_path = os.path.join(out_dir, "seqlet_confusion.tsv") 594 data_io.write_seqlet_confusion_df(confusion_df, seqlet_confusion_path) 595 596 visualization.plot_hit_vs_seqlet_counts(report_data, out_dir) 597 visualization.plot_seqlet_confusion_heatmap( 598 confusion_mat, motif_names_list, out_dir 599 ) 600 601 report_path = os.path.join(out_dir, "report.html") 602 visualization.write_report( 603 report_df, motif_names_list, report_path, compute_recall, seqlets_df is not None 604 ) 605 606 607def collapse_hits(hits_path: str, out_path: str, overlap_frac: float = 0.2) -> None: 608 """Collapse overlapping hits by selecting the best hit per overlapping group. 609 610 This function processes a set of motif hits and identifies overlapping hits, 611 keeping only the hit with the highest similarity score within each overlapping 612 group. This reduces redundancy in hit calls while preserving the most confident 613 predictions. 614 615 Parameters 616 ---------- 617 hits_path : str 618 Path to input TSV file containing hit data (hits.tsv or hits_unique.tsv). 619 out_path : str 620 Path to output TSV file with additional 'is_primary' column. 621 overlap_frac : float, default 0.2 622 Minimum fractional overlap for considering hits as overlapping. 623 For hits of lengths x and y, minimum overlap = overlap_frac * (x + y) / 2. 624 625 Notes 626 ----- 627 The algorithm uses a sweep line approach with a heap data structure to 628 efficiently identify overlapping intervals and select the best hit based 629 on similarity scores. 630 """ 631 from . import postprocessing 632 633 hits_df = data_io.load_hits(hits_path, lazy=False) 634 hits_collapsed_df = postprocessing.collapse_hits(hits_df, overlap_frac) 635 636 data_io.write_hits_processed( 637 hits_collapsed_df, out_path, schema=data_io.HITS_COLLAPSED_DTYPES 638 ) 639 640 641def intersect_hits(hits_paths: List[str], out_path: str, relaxed: bool = False) -> None: 642 """Find intersection of hits across multiple Fi-NeMo runs. 643 644 This function identifies motif hits that are consistently called across 645 multiple independent runs, providing a way to assess reproducibility and 646 identify high-confidence hits. 647 648 Parameters 649 ---------- 650 hits_paths : List[str] 651 List of paths to input TSV files from different runs. 652 out_path : str 653 Path to output TSV file containing intersection results. 654 Duplicate columns are suffixed with run index. 655 relaxed : bool, default False 656 If True, uses relaxed intersection criteria based only on motif names 657 and untrimmed coordinates. If False, assumes consistent region definitions 658 and motif trimming across runs. 659 660 Notes 661 ----- 662 The strict intersection mode requires consistent input regions and motif 663 processing parameters across all runs. The relaxed mode is more permissive 664 but may not be suitable when genomic coordinates are unavailable. 665 """ 666 from . import postprocessing 667 668 hits_dfs = [data_io.load_hits(hits_path, lazy=False) for hits_path in hits_paths] 669 hits_df = postprocessing.intersect_hits(hits_dfs, relaxed) 670 671 data_io.write_hits_processed(hits_df, out_path, schema=None) 672 673 674def cli() -> None: 675 """Command-line interface for the Fi-NeMo motif instance calling pipeline. 676 677 This function provides the main entry point for all Fi-NeMo operations including: 678 - Data preprocessing from various formats (bigWig, HDF5, TF-MoDISco) 679 - Motif hit calling using the Fi-NeMo algorithm 680 - Report generation and visualization 681 - Post-processing operations (hit collapsing, intersection) 682 """ 683 parser = argparse.ArgumentParser() 684 subparsers = parser.add_subparsers(required=True, dest="cmd") 685 686 extract_regions_bw_parser = subparsers.add_parser( 687 "extract-regions-bw", 688 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 689 help="Extract sequences and contributions from FASTA and bigwig files.", 690 ) 691 692 extract_regions_bw_parser.add_argument( 693 "-p", 694 "--peaks", 695 type=str, 696 required=True, 697 help="A peak regions file in ENCODE NarrowPeak format.", 698 ) 699 extract_regions_bw_parser.add_argument( 700 "-C", 701 "--chrom-order", 702 type=str, 703 default=None, 704 help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.", 705 ) 706 extract_regions_bw_parser.add_argument( 707 "-f", 708 "--fasta", 709 type=str, 710 required=True, 711 help="A genome FASTA file. If an .fai index file doesn't exist in the same directory, it will be created.", 712 ) 713 extract_regions_bw_parser.add_argument( 714 "-b", 715 "--bigwigs", 716 type=str, 717 required=True, 718 nargs="+", 719 help="One or more bigwig files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.", 720 ) 721 722 extract_regions_bw_parser.add_argument( 723 "-o", 724 "--out-path", 725 type=str, 726 required=True, 727 help="The path to the output .npz file.", 728 ) 729 730 extract_regions_bw_parser.add_argument( 731 "-w", 732 "--region-width", 733 type=int, 734 default=inspect.signature(extract_regions_bw) 735 .parameters["region_width"] 736 .default, 737 help="The width of the input region centered around each peak summit.", 738 ) 739 740 extract_chrombpnet_regions_h5_parser = subparsers.add_parser( 741 "extract-regions-chrombpnet-h5", 742 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 743 help="Extract sequences and contributions from ChromBPNet contributions H5 files.", 744 ) 745 746 extract_chrombpnet_regions_h5_parser.add_argument( 747 "-p", 748 "--peaks", 749 type=str, 750 default=None, 751 help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.", 752 ) 753 extract_chrombpnet_regions_h5_parser.add_argument( 754 "-C", 755 "--chrom-order", 756 type=str, 757 default=None, 758 help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.", 759 ) 760 761 extract_chrombpnet_regions_h5_parser.add_argument( 762 "-c", 763 "--h5s", 764 type=str, 765 required=True, 766 nargs="+", 767 help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.", 768 ) 769 770 extract_chrombpnet_regions_h5_parser.add_argument( 771 "-o", 772 "--out-path", 773 type=str, 774 required=True, 775 help="The path to the output .npz file.", 776 ) 777 778 extract_chrombpnet_regions_h5_parser.add_argument( 779 "-w", 780 "--region-width", 781 type=int, 782 default=inspect.signature(extract_regions_chrombpnet_h5) 783 .parameters["region_width"] 784 .default, 785 help="The width of the input region centered around each peak summit.", 786 ) 787 788 extract_regions_h5_parser = subparsers.add_parser( 789 "extract-regions-h5", 790 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 791 help="Extract sequences and contributions from ChromBPNet contributions H5 files. DEPRECATED: Use `extract-regions-chrombpnet-h5` instead.", 792 ) 793 794 extract_regions_h5_parser.add_argument( 795 "-p", 796 "--peaks", 797 type=str, 798 default=None, 799 help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.", 800 ) 801 extract_regions_h5_parser.add_argument( 802 "-C", 803 "--chrom-order", 804 type=str, 805 default=None, 806 help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.", 807 ) 808 809 extract_regions_h5_parser.add_argument( 810 "-c", 811 "--h5s", 812 type=str, 813 required=True, 814 nargs="+", 815 help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.", 816 ) 817 818 extract_regions_h5_parser.add_argument( 819 "-o", 820 "--out-path", 821 type=str, 822 required=True, 823 help="The path to the output .npz file.", 824 ) 825 826 extract_regions_h5_parser.add_argument( 827 "-w", 828 "--region-width", 829 type=int, 830 default=inspect.signature(extract_regions_chrombpnet_h5) 831 .parameters["region_width"] 832 .default, 833 help="The width of the input region centered around each peak summit.", 834 ) 835 836 extract_bpnet_regions_h5_parser = subparsers.add_parser( 837 "extract-regions-bpnet-h5", 838 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 839 help="Extract sequences and contributions from BPNet contributions H5 files.", 840 ) 841 842 extract_bpnet_regions_h5_parser.add_argument( 843 "-p", 844 "--peaks", 845 type=str, 846 default=None, 847 help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.", 848 ) 849 extract_bpnet_regions_h5_parser.add_argument( 850 "-C", 851 "--chrom-order", 852 type=str, 853 default=None, 854 help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.", 855 ) 856 857 extract_bpnet_regions_h5_parser.add_argument( 858 "-c", 859 "--h5s", 860 type=str, 861 required=True, 862 nargs="+", 863 help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.", 864 ) 865 866 extract_bpnet_regions_h5_parser.add_argument( 867 "-o", 868 "--out-path", 869 type=str, 870 required=True, 871 help="The path to the output .npz file.", 872 ) 873 874 extract_bpnet_regions_h5_parser.add_argument( 875 "-w", 876 "--region-width", 877 type=int, 878 default=inspect.signature(extract_regions_bpnet_h5) 879 .parameters["region_width"] 880 .default, 881 help="The width of the input region centered around each peak summit.", 882 ) 883 884 extract_regions_modisco_fmt_parser = subparsers.add_parser( 885 "extract-regions-modisco-fmt", 886 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 887 help="Extract sequences and contributions from tfmodisco-lite input files.", 888 ) 889 890 extract_regions_modisco_fmt_parser.add_argument( 891 "-p", 892 "--peaks", 893 type=str, 894 default=None, 895 help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.", 896 ) 897 extract_regions_modisco_fmt_parser.add_argument( 898 "-C", 899 "--chrom-order", 900 type=str, 901 default=None, 902 help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.", 903 ) 904 905 extract_regions_modisco_fmt_parser.add_argument( 906 "-s", 907 "--sequences", 908 type=str, 909 required=True, 910 help="A .npy or .npz file containing one-hot encoded sequences.", 911 ) 912 913 extract_regions_modisco_fmt_parser.add_argument( 914 "-a", 915 "--attributions", 916 type=str, 917 required=True, 918 nargs="+", 919 help="One or more .npy or .npz files of hypothetical contribution scores, with paths delimited by whitespace. Scores are averaged across files.", 920 ) 921 922 extract_regions_modisco_fmt_parser.add_argument( 923 "-o", 924 "--out-path", 925 type=str, 926 required=True, 927 help="The path to the output .npz file.", 928 ) 929 930 extract_regions_modisco_fmt_parser.add_argument( 931 "-w", 932 "--region-width", 933 type=int, 934 default=inspect.signature(extract_regions_modisco_fmt) 935 .parameters["region_width"] 936 .default, 937 help="The width of the input region centered around each peak summit.", 938 ) 939 940 call_hits_parser = subparsers.add_parser( 941 "call-hits", 942 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 943 help="Call hits on provided sequences, contributions, and motif CWM's.", 944 ) 945 946 call_hits_parser.add_argument( 947 "-M", 948 "--mode", 949 type=str, 950 default=inspect.signature(call_hits).parameters["mode"].default, 951 choices={"pp", "ph", "hp", "hh"}, 952 help="The type of attributions to use for CWM's and input contribution scores, respectively. 'h' for hypothetical and 'p' for projected.", 953 ) 954 955 call_hits_parser.add_argument( 956 "-r", 957 "--regions", 958 type=str, 959 required=True, 960 help="A .npz file of input sequences, contributions, and coordinates. Can be generated using `finemo extract-regions-*` subcommands.", 961 ) 962 call_hits_parser.add_argument( 963 "-m", 964 "--modisco-h5", 965 type=str, 966 required=True, 967 help="A tfmodisco-lite output H5 file of motif patterns.", 968 ) 969 970 call_hits_parser.add_argument( 971 "-p", 972 "--peaks", 973 type=str, 974 default=None, 975 help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.", 976 ) 977 call_hits_parser.add_argument( 978 "-C", 979 "--chrom-order", 980 type=str, 981 default=None, 982 help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.", 983 ) 984 985 call_hits_parser.add_argument( 986 "-I", 987 "--motifs-include", 988 type=str, 989 default=None, 990 help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column to include in hit calling. If omitted, all motifs in the modisco H5 file are used.", 991 ) 992 call_hits_parser.add_argument( 993 "-N", 994 "--motif-names", 995 type=str, 996 default=None, 997 help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom names in the second column. Omitted motifs default to tfmodisco names.", 998 ) 999 1000 call_hits_parser.add_argument( 1001 "-o", 1002 "--out-dir", 1003 type=str, 1004 required=True, 1005 help="The path to the output directory.", 1006 ) 1007 1008 call_hits_parser.add_argument( 1009 "-t", 1010 "--cwm-trim-threshold", 1011 type=float, 1012 default=inspect.signature(call_hits) 1013 .parameters["cwm_trim_threshold_default"] 1014 .default, 1015 help="The default threshold to determine motif start and end positions within the full CWMs.", 1016 ) 1017 call_hits_parser.add_argument( 1018 "-T", 1019 "--cwm-trim-thresholds", 1020 type=str, 1021 default=None, 1022 help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom trim thresholds in the second column. Omitted motifs default to the `--cwm-trim-threshold` value.", 1023 ) 1024 call_hits_parser.add_argument( 1025 "-R", 1026 "--cwm-trim-coords", 1027 type=str, 1028 default=None, 1029 help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom trim start and end coordinates in the second and third columns, respectively. Omitted motifs default to `--cwm-trim-thresholds` values.", 1030 ) 1031 1032 call_hits_parser.add_argument( 1033 "-l", 1034 "--global-lambda", 1035 type=float, 1036 default=inspect.signature(call_hits).parameters["lambda_default"].default, 1037 help="The default L1 regularization weight determining the sparsity of hits.", 1038 ) 1039 call_hits_parser.add_argument( 1040 "-L", 1041 "--motif-lambdas", 1042 type=str, 1043 default=None, 1044 help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and motif-specific lambdas in the second column. Omitted motifs default to the `--global-lambda` value.", 1045 ) 1046 call_hits_parser.add_argument( 1047 "-a", 1048 "--alpha", 1049 type=float, 1050 default=None, 1051 help="DEPRECATED: Please use the `--lambda` argument instead.", 1052 ) 1053 call_hits_parser.add_argument( 1054 "-A", 1055 "--motif-alphas", 1056 type=str, 1057 default=None, 1058 help="DEPRECATED: Please use the `--motif-lambdas` argument instead.", 1059 ) 1060 1061 call_hits_parser.add_argument( 1062 "-f", 1063 "--no-post-filter", 1064 action="store_true", 1065 help="Do not perform post-hit-calling filtering. By default, hits are filtered based on a minimum cosine similarity of `lambda` with the input contributions.", 1066 ) 1067 call_hits_parser.add_argument( 1068 "-q", 1069 "--sqrt-transform", 1070 action="store_true", 1071 help="Apply a signed square root transform to the input contributions and CWMs before hit calling.", 1072 ) 1073 call_hits_parser.add_argument( 1074 "-s", 1075 "--step-size-max", 1076 type=float, 1077 default=inspect.signature(call_hits).parameters["step_size_max"].default, 1078 help="The maximum optimizer step size.", 1079 ) 1080 call_hits_parser.add_argument( 1081 "-i", 1082 "--step-size-min", 1083 type=float, 1084 default=inspect.signature(call_hits).parameters["step_size_min"].default, 1085 help="The minimum optimizer step size.", 1086 ) 1087 call_hits_parser.add_argument( 1088 "-j", 1089 "--step-adjust", 1090 type=float, 1091 default=inspect.signature(call_hits).parameters["step_adjust"].default, 1092 help="The optimizer step size adjustment factor. If the optimizer diverges, the step size is multiplicatively adjusted by this factor", 1093 ) 1094 call_hits_parser.add_argument( 1095 "-c", 1096 "--convergence-tol", 1097 type=float, 1098 default=inspect.signature(call_hits).parameters["convergence_tol"].default, 1099 help="The tolerance for determining convergence. The optimizer exits when the duality gap is less than the tolerance.", 1100 ) 1101 call_hits_parser.add_argument( 1102 "-S", 1103 "--max-steps", 1104 type=int, 1105 default=inspect.signature(call_hits).parameters["max_steps"].default, 1106 help="The maximum number of optimization steps.", 1107 ) 1108 call_hits_parser.add_argument( 1109 "-b", 1110 "--batch-size", 1111 type=int, 1112 default=inspect.signature(call_hits).parameters["batch_size"].default, 1113 help="The batch size used for optimization.", 1114 ) 1115 call_hits_parser.add_argument( 1116 "-d", 1117 "--device", 1118 type=str, 1119 default=None, 1120 help="DEPRECATED: Please use the `CUDA_VISIBLE_DEVICES` environment variable to specify the GPU device.", 1121 ) 1122 call_hits_parser.add_argument( 1123 "-J", 1124 "--compile", 1125 action="store_true", 1126 help="JIT-compile the optimizer for faster performance. This may not be supported on older GPUs.", 1127 ) 1128 1129 report_parser = subparsers.add_parser( 1130 "report", 1131 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 1132 help="Generate statistics and visualizations from hits and tfmodisco-lite motif data.", 1133 ) 1134 1135 report_parser.add_argument( 1136 "-r", 1137 "--regions", 1138 type=str, 1139 required=True, 1140 help="A .npz file containing input sequences, contributions, and coordinates. Must be the same as that used for `finemo call-hits`.", 1141 ) 1142 report_parser.add_argument( 1143 "-H", 1144 "--hits", 1145 type=str, 1146 required=True, 1147 help="The output directory generated by the `finemo call-hits` command on the regions specified in `--regions`.", 1148 ) 1149 report_parser.add_argument( 1150 "-p", 1151 "--peaks", 1152 type=str, 1153 default=None, 1154 help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.", 1155 ) 1156 report_parser.add_argument( 1157 "-m", 1158 "--modisco-h5", 1159 type=str, 1160 default=None, 1161 help="The tfmodisco-lite output H5 file of motif patterns. Must be the same as that used for hit calling unless `--no-recall` is set. If omitted, seqlet-derived metrics will not be computed.", 1162 ) 1163 report_parser.add_argument( 1164 "-I", 1165 "--motifs-include", 1166 type=str, 1167 default=None, 1168 help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.", 1169 ) 1170 report_parser.add_argument( 1171 "-N", 1172 "--motif-names", 1173 type=str, 1174 default=None, 1175 help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.", 1176 ) 1177 1178 report_parser.add_argument( 1179 "-o", 1180 "--out-dir", 1181 type=str, 1182 required=True, 1183 help="The path to the report output directory.", 1184 ) 1185 1186 report_parser.add_argument( 1187 "-W", 1188 "--modisco-region-width", 1189 type=int, 1190 default=inspect.signature(report).parameters["modisco_region_width"].default, 1191 help="The width of the region around each peak summit used by tfmodisco-lite.", 1192 ) 1193 report_parser.add_argument( 1194 "-t", 1195 "--cwm-trim-threshold", 1196 type=float, 1197 default=inspect.signature(report).parameters["cwm_trim_threshold"].default, 1198 help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.", 1199 ) 1200 report_parser.add_argument( 1201 "-n", 1202 "--no-recall", 1203 action="store_true", 1204 help="Do not compute motif recall metrics.", 1205 ) 1206 report_parser.add_argument( 1207 "-s", 1208 "--no-seqlets", 1209 action="store_true", 1210 help="DEPRECATED: Please omit the `--modisco-h5` argument instead.", 1211 ) 1212 1213 collapse_hits_parser = subparsers.add_parser( 1214 "collapse-hits", 1215 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 1216 help="Identify best hit by motif similarity among sets of overlapping hits.", 1217 ) 1218 1219 collapse_hits_parser.add_argument( 1220 "-i", 1221 "--hits", 1222 type=str, 1223 required=True, 1224 help="The `hits.tsv` or `hits_unique.tsv` file from `call-hits`.", 1225 ) 1226 collapse_hits_parser.add_argument( 1227 "-o", 1228 "--out-path", 1229 type=str, 1230 required=True, 1231 help='The path to the output .tsv file with an additional "is_primary" column.', 1232 ) 1233 collapse_hits_parser.add_argument( 1234 "-O", 1235 "--overlap-frac", 1236 type=float, 1237 default=inspect.signature(collapse_hits).parameters["overlap_frac"].default, 1238 help="The threshold for determining overlapping hits. For two hits with lengths x and y, the minimum overlap is defined as `overlap_frac * (x + y) / 2`. The default value of 0.2 means that two hits must overlap by at least 20% of their average lengths to be considered overlapping.", 1239 ) 1240 1241 intersect_hits_parser = subparsers.add_parser( 1242 "intersect-hits", 1243 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 1244 help="Intersect hits across multiple runs.", 1245 ) 1246 1247 intersect_hits_parser.add_argument( 1248 "-i", 1249 "--hits", 1250 type=str, 1251 required=True, 1252 nargs="+", 1253 help="One or more hits.tsv or hits_unique.tsv files, with paths delimited by whitespace.", 1254 ) 1255 intersect_hits_parser.add_argument( 1256 "-o", 1257 "--out-path", 1258 type=str, 1259 required=True, 1260 help="The path to the output .tsv file. Duplicate columns are suffixed with the positional index of the input file.", 1261 ) 1262 intersect_hits_parser.add_argument( 1263 "-r", 1264 "--relaxed", 1265 action="store_true", 1266 help="Use relaxed intersection criteria, using only motif names and untrimmed coordinates. By default, the intersection assumes consistent region definitions and motif trimming. This option is not recommended if genomic coordinates are unavailable.", 1267 ) 1268 1269 args = parser.parse_args() 1270 1271 if args.cmd == "extract-regions-bw": 1272 extract_regions_bw( 1273 args.peaks, 1274 args.chrom_order, 1275 args.fasta, 1276 args.bigwigs, 1277 args.out_path, 1278 args.region_width, 1279 ) 1280 1281 elif args.cmd == "extract-regions-chrombpnet-h5": 1282 extract_regions_chrombpnet_h5( 1283 args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width 1284 ) 1285 1286 elif args.cmd == "extract-regions-h5": 1287 print( 1288 "WARNING: The `extract-regions-h5` command is deprecated. Use `extract-regions-chrombpnet-h5` instead." 1289 ) 1290 extract_regions_chrombpnet_h5( 1291 args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width 1292 ) 1293 1294 elif args.cmd == "extract-regions-bpnet-h5": 1295 extract_regions_bpnet_h5( 1296 args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width 1297 ) 1298 1299 elif args.cmd == "extract-regions-modisco-fmt": 1300 extract_regions_modisco_fmt( 1301 args.peaks, 1302 args.chrom_order, 1303 args.attributions, 1304 args.sequences, 1305 args.out_path, 1306 args.region_width, 1307 ) 1308 1309 elif args.cmd == "call-hits": 1310 if args.alpha is not None: 1311 warnings.warn( 1312 "The `--alpha` flag is deprecated and will be removed in a future version. Please use the `--global-lambda` flag instead." 1313 ) 1314 args.global_lambda = args.alpha 1315 if args.motif_alphas is not None: 1316 warnings.warn( 1317 "The `--motif-alphas` flag is deprecated and will be removed in a future version. Please use the `--motif-lambdas` flag instead." 1318 ) 1319 args.motif_lambdas = args.motif_alphas 1320 1321 call_hits( 1322 args.regions, 1323 args.peaks, 1324 args.modisco_h5, 1325 args.chrom_order, 1326 args.motifs_include, 1327 args.motif_names, 1328 args.motif_lambdas, 1329 args.out_dir, 1330 args.cwm_trim_coords, 1331 args.cwm_trim_thresholds, 1332 args.cwm_trim_threshold, 1333 args.global_lambda, 1334 args.step_size_max, 1335 args.step_size_min, 1336 args.sqrt_transform, 1337 args.convergence_tol, 1338 args.max_steps, 1339 args.batch_size, 1340 args.step_adjust, 1341 args.device, 1342 args.mode, 1343 args.no_post_filter, 1344 args.compile, 1345 ) 1346 1347 elif args.cmd == "report": 1348 report( 1349 args.regions, 1350 args.hits, 1351 args.modisco_h5, 1352 args.peaks, 1353 args.motifs_include, 1354 args.motif_names, 1355 args.out_dir, 1356 args.modisco_region_width, 1357 args.cwm_trim_threshold, 1358 not args.no_recall, 1359 not args.no_seqlets, 1360 ) 1361 1362 elif args.cmd == "collapse-hits": 1363 collapse_hits(args.hits, args.out_path, args.overlap_frac) 1364 1365 elif args.cmd == "intersect-hits": 1366 intersect_hits(args.hits, args.out_path, args.relaxed)
25def extract_regions_bw( 26 peaks_path: str, 27 chrom_order_path: Optional[str], 28 fa_path: str, 29 bw_paths: List[str], 30 out_path: str, 31 region_width: int = 1000, 32) -> None: 33 """Extract genomic regions and contribution scores from bigWig and FASTA files. 34 35 Parameters 36 ---------- 37 peaks_path : str 38 Path to ENCODE NarrowPeak format file. 39 chrom_order_path : str, optional 40 Path to chromosome ordering file. 41 fa_path : str 42 Path to genome FASTA file. 43 bw_paths : List[str] 44 List of bigWig file paths containing contribution scores. 45 out_path : str 46 Output path for NPZ file. 47 region_width : int, default 1000 48 Width of regions to extract around peak summits. 49 50 Notes 51 ----- 52 BigWig files only provide projected contribution scores. 53 """ 54 half_width = region_width // 2 55 56 # Load peak regions and extract sequences/contributions 57 peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width) 58 sequences, contribs = data_io.load_regions_from_bw( 59 peaks_df, fa_path, bw_paths, half_width 60 ) 61 62 # Save processed data to NPZ format 63 data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)
Extract genomic regions and contribution scores from bigWig and FASTA files.
Parameters
- peaks_path (str): Path to ENCODE NarrowPeak format file.
- chrom_order_path (str, optional): Path to chromosome ordering file.
- fa_path (str): Path to genome FASTA file.
- bw_paths (List[str]): List of bigWig file paths containing contribution scores.
- out_path (str): Output path for NPZ file.
- region_width (int, default 1000): Width of regions to extract around peak summits.
Notes
BigWig files only provide projected contribution scores.
66def extract_regions_chrombpnet_h5( 67 peaks_path: Optional[str], 68 chrom_order_path: Optional[str], 69 h5_paths: List[str], 70 out_path: str, 71 region_width: int = 1000, 72) -> None: 73 """Extract genomic regions and contribution scores from ChromBPNet HDF5 files. 74 75 Parameters 76 ---------- 77 peaks_path : str, optional 78 Path to ENCODE NarrowPeak format file. If None, lacks absolute coordinates. 79 chrom_order_path : str, optional 80 Path to chromosome ordering file. 81 h5_paths : List[str] 82 List of ChromBPNet HDF5 file paths. 83 out_path : str 84 Output path for NPZ file. 85 region_width : int, default 1000 86 Width of regions to extract around peak summits. 87 """ 88 half_width = region_width // 2 89 90 if peaks_path is not None: 91 peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width) 92 else: 93 peaks_df = None 94 95 sequences, contribs = data_io.load_regions_from_chrombpnet_h5(h5_paths, half_width) 96 97 data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)
Extract genomic regions and contribution scores from ChromBPNet HDF5 files.
Parameters
- peaks_path (str, optional): Path to ENCODE NarrowPeak format file. If None, lacks absolute coordinates.
- chrom_order_path (str, optional): Path to chromosome ordering file.
- h5_paths (List[str]): List of ChromBPNet HDF5 file paths.
- out_path (str): Output path for NPZ file.
- region_width (int, default 1000): Width of regions to extract around peak summits.
100def extract_regions_bpnet_h5( 101 peaks_path: Optional[str], 102 chrom_order_path: Optional[str], 103 h5_paths: List[str], 104 out_path: str, 105 region_width: int = 1000, 106) -> None: 107 """Extract genomic regions and contribution scores from BPNet HDF5 files. 108 109 Parameters 110 ---------- 111 peaks_path : str, optional 112 Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates. 113 chrom_order_path : str, optional 114 Path to chromosome ordering file. 115 h5_paths : List[str] 116 List of BPNet HDF5 file paths. 117 out_path : str 118 Output path for NPZ file. 119 region_width : int, default 1000 120 Width of regions to extract around peak summits. 121 """ 122 half_width = region_width // 2 123 124 if peaks_path is not None: 125 peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width) 126 else: 127 peaks_df = None 128 129 sequences, contribs = data_io.load_regions_from_bpnet_h5(h5_paths, half_width) 130 131 data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)
Extract genomic regions and contribution scores from BPNet HDF5 files.
Parameters
- peaks_path (str, optional): Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates.
- chrom_order_path (str, optional): Path to chromosome ordering file.
- h5_paths (List[str]): List of BPNet HDF5 file paths.
- out_path (str): Output path for NPZ file.
- region_width (int, default 1000): Width of regions to extract around peak summits.
134def extract_regions_modisco_fmt( 135 peaks_path: Optional[str], 136 chrom_order_path: Optional[str], 137 shaps_paths: List[str], 138 ohe_path: str, 139 out_path: str, 140 region_width: int = 1000, 141) -> None: 142 """Extract genomic regions and contribution scores from TF-MoDISco format files. 143 144 Parameters 145 ---------- 146 peaks_path : str, optional 147 Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates. 148 chrom_order_path : str, optional 149 Path to chromosome ordering file. 150 shaps_paths : List[str] 151 List of paths to .npy/.npz files containing SHAP/attribution scores. 152 ohe_path : str 153 Path to .npy/.npz file containing one-hot encoded sequences. 154 out_path : str 155 Output path for NPZ file. 156 region_width : int, default 1000 157 Width of regions to extract around peak summits. 158 """ 159 half_width = region_width // 2 160 161 if peaks_path is not None: 162 peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width) 163 else: 164 peaks_df = None 165 166 sequences, contribs = data_io.load_regions_from_modisco_fmt( 167 shaps_paths, ohe_path, half_width 168 ) 169 170 data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)
Extract genomic regions and contribution scores from TF-MoDISco format files.
Parameters
- peaks_path (str, optional): Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates.
- chrom_order_path (str, optional): Path to chromosome ordering file.
- shaps_paths (List[str]): List of paths to .npy/.npz files containing SHAP/attribution scores.
- ohe_path (str): Path to .npy/.npz file containing one-hot encoded sequences.
- out_path (str): Output path for NPZ file.
- region_width (int, default 1000): Width of regions to extract around peak summits.
173def call_hits( 174 regions_path: str, 175 peaks_path: Optional[str], 176 modisco_h5_path: str, 177 chrom_order_path: Optional[str], 178 motifs_include_path: Optional[str], 179 motif_names_path: Optional[str], 180 motif_lambdas_path: Optional[str], 181 out_dir: str, 182 cwm_trim_coords_path: Optional[str] = None, 183 cwm_trim_thresholds_path: Optional[str] = None, 184 cwm_trim_threshold_default: float = 0.3, 185 lambda_default: float = 0.7, 186 step_size_max: float = 3.0, 187 step_size_min: float = 0.08, 188 sqrt_transform: bool = False, 189 convergence_tol: float = 0.0005, 190 max_steps: int = 10000, 191 batch_size: int = 2000, 192 step_adjust: float = 0.7, 193 device: Optional[str] = None, 194 mode: str = "pp", 195 no_post_filter: bool = False, 196 compile_optimizer: bool = False, 197) -> None: 198 """Call motif hits using the Fi-NeMo algorithm on preprocessed genomic regions. 199 200 This function implements the core Fi-NeMo hit calling pipeline, which identifies 201 motif instances by solving a sparse reconstruction problem using proximal gradient 202 descent. The algorithm represents contribution scores as weighted combinations of 203 motif CWMs at specific positions. 204 205 Parameters 206 ---------- 207 regions_path : str 208 Path to NPZ file containing preprocessed regions (sequences, contributions, 209 and optional peak coordinates). 210 peaks_path : str, optional 211 DEPRECATED. Path to ENCODE NarrowPeak format file. Peak data should be 212 included during preprocessing instead. 213 modisco_h5_path : str 214 Path to TF-MoDISco H5 file containing motif CWMs. 215 chrom_order_path : str, optional 216 DEPRECATED. Path to chromosome ordering file. 217 motifs_include_path : str, optional 218 Path to file listing motif names to include in analysis. 219 motif_names_path : str, optional 220 Path to file mapping motif IDs to custom names. 221 motif_lambdas_path : str, optional 222 Path to file specifying per-motif lambda values. 223 out_dir : str 224 Output directory for results. 225 cwm_trim_coords_path : str, optional 226 Path to file specifying custom motif trimming coordinates. 227 cwm_trim_thresholds_path : str, optional 228 Path to file specifying custom motif trimming thresholds. 229 cwm_trim_threshold_default : float, default 0.3 230 Default threshold for motif trimming. 231 lambda_default : float, default 0.7 232 Default L1 regularization weight. 233 step_size_max : float, default 3.0 234 Maximum optimization step size. 235 step_size_min : float, default 0.08 236 Minimum optimization step size. 237 sqrt_transform : bool, default False 238 Whether to apply signed square root transform to contributions. 239 convergence_tol : float, default 0.0005 240 Convergence tolerance for duality gap. 241 max_steps : int, default 10000 242 Maximum number of optimization steps. 243 batch_size : int, default 2000 244 Batch size for GPU processing. 245 step_adjust : float, default 0.7 246 Step size adjustment factor on divergence. 247 device : str, optional 248 DEPRECATED. Use CUDA_VISIBLE_DEVICES environment variable instead. 249 mode : str, default "pp" 250 Contribution type mode ('pp', 'ph', 'hp', 'hh') where 'p'=projected, 'h'=hypothetical. 251 no_post_filter : bool, default False 252 If True, skip post-hit-calling similarity filtering. 253 compile_optimizer : bool, default False 254 Whether to JIT-compile the optimizer for speed. 255 256 Notes 257 ----- 258 The Fi-NeMo algorithm solves the optimization problem: 259 minimize_c: ||contribs - reconstruction(c)||²₂ + λ||c||₁ 260 subject to: c ≥ 0 261 262 where c represents motif hit coefficients and reconstruction uses convolution 263 with motif CWMs. 264 """ 265 266 params = locals() 267 import torch 268 from . import hitcaller 269 270 if device is not None: 271 warnings.warn( 272 "The `--device` flag is deprecated and will be removed in a future version. Please use the `CUDA_VISIBLE_DEVICES` environment variable to specify the GPU device." 273 ) 274 275 sequences, contribs, peaks_df, has_peaks = data_io.load_regions_npz(regions_path) 276 277 region_width = sequences.shape[2] 278 if region_width % 2 != 0: 279 raise ValueError(f"Region width of {region_width} is not divisible by 2.") 280 281 half_width = region_width // 2 282 num_regions = contribs.shape[0] 283 284 if peaks_path is not None: 285 warnings.warn( 286 "Providing a peaks file to `call-hits` is deprecated, and this option will be removed in a future version. Peaks should instead be provided in the preprocessing step to be included in `regions.npz`." 287 ) 288 peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width) 289 has_peaks = True 290 291 if not has_peaks: 292 warnings.warn( 293 "No peak region data provided. Output hits will lack absolute genomic coordinates." 294 ) 295 296 if mode == "pp": 297 motif_type = "cwm" 298 use_hypothetical_contribs = False 299 elif mode == "ph": 300 motif_type = "cwm" 301 use_hypothetical_contribs = True 302 elif mode == "hp": 303 motif_type = "hcwm" 304 use_hypothetical_contribs = False 305 elif mode == "hh": 306 motif_type = "hcwm" 307 use_hypothetical_contribs = True 308 else: 309 raise ValueError( 310 f"Invalid mode: {mode}. Must be one of 'pp', 'ph', 'hp', 'hh'." 311 ) 312 313 if motifs_include_path is not None: 314 motifs_include = data_io.load_txt(motifs_include_path) 315 else: 316 motifs_include = None 317 318 if motif_names_path is not None: 319 motif_name_map = data_io.load_mapping(motif_names_path, str) 320 else: 321 motif_name_map = None 322 323 if motif_lambdas_path is not None: 324 motif_lambdas = data_io.load_mapping(motif_lambdas_path, float) 325 else: 326 motif_lambdas = None 327 328 if cwm_trim_coords_path is not None: 329 trim_coords = data_io.load_mapping_tuple(cwm_trim_coords_path, int) 330 else: 331 trim_coords = None 332 333 if cwm_trim_thresholds_path is not None: 334 trim_thresholds = data_io.load_mapping(cwm_trim_thresholds_path, float) 335 else: 336 trim_thresholds = None 337 338 motifs_df, cwms, trim_masks, _ = data_io.load_modisco_motifs( 339 modisco_h5_path, 340 trim_coords, 341 trim_thresholds, 342 cwm_trim_threshold_default, 343 motif_type, 344 motifs_include, 345 motif_name_map, 346 motif_lambdas, 347 lambda_default, 348 True, 349 ) 350 num_motifs = cwms.shape[0] 351 motif_width = cwms.shape[2] 352 lambdas = motifs_df.get_column("lambda").to_numpy(writable=True) 353 354 device_obj = torch.device(device) if device is not None else None 355 hits_df, qc_df = hitcaller.fit_contribs( 356 cwms, 357 contribs, 358 sequences, 359 trim_masks, 360 use_hypothetical_contribs, 361 lambdas, 362 step_size_max, 363 step_size_min, 364 sqrt_transform, 365 convergence_tol, 366 max_steps, 367 batch_size, 368 step_adjust, 369 not no_post_filter, 370 device_obj, 371 compile_optimizer, 372 ) 373 374 os.makedirs(out_dir, exist_ok=True) 375 out_path_qc = os.path.join(out_dir, "peaks_qc.tsv") 376 data_io.write_hits(hits_df, peaks_df, motifs_df, qc_df, out_dir, motif_width) 377 data_io.write_qc(qc_df, peaks_df, out_path_qc) 378 379 out_path_motif_df = os.path.join(out_dir, "motif_data.tsv") 380 data_io.write_motifs_df(motifs_df, out_path_motif_df) 381 382 out_path_motif_cwms = os.path.join(out_dir, "motif_cwms.npy") 383 data_io.write_motif_cwms(cwms, out_path_motif_cwms) 384 385 params |= { 386 "region_width": region_width, 387 "num_regions": num_regions, 388 "untrimmed_motif_width": motif_width, 389 "num_motifs": num_motifs, 390 } 391 392 out_path_params = os.path.join(out_dir, "parameters.json") 393 data_io.write_params(params, out_path_params)
Call motif hits using the Fi-NeMo algorithm on preprocessed genomic regions.
This function implements the core Fi-NeMo hit calling pipeline, which identifies motif instances by solving a sparse reconstruction problem using proximal gradient descent. The algorithm represents contribution scores as weighted combinations of motif CWMs at specific positions.
Parameters
- regions_path (str): Path to NPZ file containing preprocessed regions (sequences, contributions, and optional peak coordinates).
- peaks_path (str, optional): DEPRECATED. Path to ENCODE NarrowPeak format file. Peak data should be included during preprocessing instead.
- modisco_h5_path (str): Path to TF-MoDISco H5 file containing motif CWMs.
- chrom_order_path (str, optional): DEPRECATED. Path to chromosome ordering file.
- motifs_include_path (str, optional): Path to file listing motif names to include in analysis.
- motif_names_path (str, optional): Path to file mapping motif IDs to custom names.
- motif_lambdas_path (str, optional): Path to file specifying per-motif lambda values.
- out_dir (str): Output directory for results.
- cwm_trim_coords_path (str, optional): Path to file specifying custom motif trimming coordinates.
- cwm_trim_thresholds_path (str, optional): Path to file specifying custom motif trimming thresholds.
- cwm_trim_threshold_default (float, default 0.3): Default threshold for motif trimming.
- lambda_default (float, default 0.7): Default L1 regularization weight.
- step_size_max (float, default 3.0): Maximum optimization step size.
- step_size_min (float, default 0.08): Minimum optimization step size.
- sqrt_transform (bool, default False): Whether to apply signed square root transform to contributions.
- convergence_tol (float, default 0.0005): Convergence tolerance for duality gap.
- max_steps (int, default 10000): Maximum number of optimization steps.
- batch_size (int, default 2000): Batch size for GPU processing.
- step_adjust (float, default 0.7): Step size adjustment factor on divergence.
- device (str, optional): DEPRECATED. Use CUDA_VISIBLE_DEVICES environment variable instead.
- mode (str, default "pp"): Contribution type mode ('pp', 'ph', 'hp', 'hh') where 'p'=projected, 'h'=hypothetical.
- no_post_filter (bool, default False): If True, skip post-hit-calling similarity filtering.
- compile_optimizer (bool, default False): Whether to JIT-compile the optimizer for speed.
Notes
The Fi-NeMo algorithm solves the optimization problem: minimize_c: ||contribs - reconstruction(c)||²₂ + λ||c||₁ subject to: c ≥ 0
where c represents motif hit coefficients and reconstruction uses convolution with motif CWMs.
396def report( 397 regions_path: str, 398 hits_dir: str, 399 modisco_h5_path: Optional[str], 400 peaks_path: Optional[str], 401 motifs_include_path: Optional[str], 402 motif_names_path: Optional[str], 403 out_dir: str, 404 modisco_region_width: int = 400, 405 cwm_trim_threshold: float = 0.3, 406 compute_recall: bool = True, 407 use_seqlets: bool = True, 408) -> None: 409 """Generate comprehensive HTML report with statistics and visualizations. 410 411 This function creates detailed analysis reports comparing Fi-NeMo hit calling 412 results with TF-MoDISco seqlets, including performance metrics, distribution 413 plots, and motif visualization. The report provides insights into hit calling 414 quality and motif discovery accuracy. 415 416 Parameters 417 ---------- 418 regions_path : str 419 Path to NPZ file containing the same regions used for hit calling. 420 hits_dir : str 421 Path to directory containing Fi-NeMo hit calling outputs. 422 modisco_h5_path : str, optional 423 Path to TF-MoDISco H5 file. If None, seqlet comparisons are skipped. 424 peaks_path : str, optional 425 DEPRECATED. Peak coordinates should be included in regions file. 426 motifs_include_path : str, optional 427 DEPRECATED. This information is inferred from hit calling outputs. 428 motif_names_path : str, optional 429 DEPRECATED. This information is inferred from hit calling outputs. 430 out_dir : str 431 Output directory for report files. 432 modisco_region_width : int, default 400 433 Width of regions used by TF-MoDISco (needed for coordinate conversion). 434 cwm_trim_threshold : float, default 0.3 435 DEPRECATED. This information is inferred from hit calling outputs. 436 compute_recall : bool, default True 437 Whether to compute recall metrics against TF-MoDISco seqlets. 438 use_seqlets : bool, default True 439 Whether to include seqlet-based comparisons in the report. 440 441 Notes 442 ----- 443 The generated report includes: 444 - Hit vs seqlet count comparisons 445 - Motif CWM visualizations 446 - Hit statistic distributions 447 - Co-occurrence heatmaps 448 - Confusion matrices for overlapping motifs 449 """ 450 from . import evaluation, visualization 451 452 sequences, contribs, peaks_df, _ = data_io.load_regions_npz(regions_path) 453 if len(contribs.shape) == 3: 454 regions = contribs * sequences 455 elif len(contribs.shape) == 2: 456 regions = contribs[:, None, :] * sequences 457 else: 458 raise ValueError(f"Unexpected contribs shape: {contribs.shape}") 459 460 half_width = regions.shape[2] // 2 461 modisco_half_width = modisco_region_width // 2 462 463 if peaks_path is not None: 464 warnings.warn( 465 "Providing a peaks file to `report` is deprecated, and this option will be removed in a future version. Peaks should instead be provided in the preprocessing step to be included in `regions.npz`." 466 ) 467 peaks_df = data_io.load_peaks(peaks_path, None, half_width) 468 469 if hits_dir.endswith(".tsv"): 470 warnings.warn( 471 "Passing a hits.tsv file to `finemo report` is deprecated. Please provide the directory containing the hits.tsv file instead." 472 ) 473 474 hits_path = hits_dir 475 476 hits_df = data_io.load_hits(hits_path, lazy=True) 477 478 if motifs_include_path is not None: 479 motifs_include = data_io.load_txt(motifs_include_path) 480 else: 481 motifs_include = None 482 483 if motif_names_path is not None: 484 motif_name_map = data_io.load_mapping(motif_names_path, str) 485 else: 486 motif_name_map = None 487 488 if modisco_h5_path is not None: 489 motifs_df, cwms_modisco, _, motif_names = data_io.load_modisco_motifs( 490 modisco_h5_path, 491 None, 492 None, 493 cwm_trim_threshold, 494 "cwm", 495 motifs_include, 496 motif_name_map, 497 None, 498 1.0, 499 True, 500 ) 501 else: 502 # When no modisco_h5_path is provided in legacy TSV mode, we can't compute motifs 503 # This will cause an error later, but that's expected behavior 504 raise ValueError( 505 "modisco_h5_path is required when providing a hits.tsv file directly" 506 ) 507 508 else: 509 hits_df_path = os.path.join(hits_dir, "hits.tsv") 510 hits_df = data_io.load_hits(hits_df_path, lazy=True) 511 512 motifs_df_path = os.path.join(hits_dir, "motif_data.tsv") 513 motifs_df, motif_names = data_io.load_motifs_df(motifs_df_path) 514 515 cwms_modisco_path = os.path.join(hits_dir, "motif_cwms.npy") 516 cwms_modisco = data_io.load_motif_cwms(cwms_modisco_path) 517 518 params_path = os.path.join(hits_dir, "parameters.json") 519 params = data_io.load_params(params_path) 520 cwm_trim_threshold = params["cwm_trim_threshold_default"] 521 522 if not use_seqlets: 523 warnings.warn( 524 "Usage of the `--no-seqlets` flag is deprecated and will be removed in a future version. Please omit the `--modisco-h5` argument instead." 525 ) 526 seqlets_df = None 527 elif modisco_h5_path is None: 528 compute_recall = False 529 seqlets_df = None 530 else: 531 seqlets_df = data_io.load_modisco_seqlets( 532 modisco_h5_path, 533 peaks_df, 534 motifs_df, 535 half_width, 536 modisco_half_width, 537 lazy=True, 538 ) 539 540 motif_width = cwms_modisco.shape[2] 541 542 # Convert to LazyFrame if needed and ensure motif_names is a list 543 if isinstance(hits_df, pl.LazyFrame): 544 hits_df_lazy: pl.LazyFrame = hits_df 545 else: 546 hits_df_lazy: pl.LazyFrame = hits_df.lazy() 547 548 motif_names_list: List[str] = list(motif_names) 549 550 occ_df, coooc = evaluation.get_motif_occurences(hits_df_lazy, motif_names_list) 551 552 report_data, report_df, cwms, trim_bounds = evaluation.tfmodisco_comparison( 553 regions, 554 hits_df, 555 peaks_df, 556 seqlets_df, 557 motifs_df, 558 cwms_modisco, 559 motif_names_list, 560 modisco_half_width, 561 motif_width, 562 compute_recall, 563 ) 564 565 if seqlets_df is not None: 566 confusion_df, confusion_mat = evaluation.seqlet_confusion( 567 hits_df, seqlets_df, peaks_df, motif_names_list, motif_width 568 ) 569 else: 570 confusion_df, confusion_mat = None, None 571 572 os.makedirs(out_dir, exist_ok=True) 573 574 occ_path = os.path.join(out_dir, "motif_occurrences.tsv") 575 data_io.write_occ_df(occ_df, occ_path) 576 577 data_io.write_report_data(report_df, cwms, out_dir) 578 579 visualization.plot_hit_stat_distributions(hits_df_lazy, motif_names_list, out_dir) 580 visualization.plot_hit_peak_distributions(occ_df, motif_names_list, out_dir) 581 visualization.plot_peak_motif_indicator_heatmap(coooc, motif_names_list, out_dir) 582 583 plot_dir = os.path.join(out_dir, "CWMs") 584 visualization.plot_cwms(cwms, trim_bounds, plot_dir) 585 586 if seqlets_df is not None: 587 seqlets_collected = ( 588 seqlets_df.collect() if isinstance(seqlets_df, pl.LazyFrame) else seqlets_df 589 ) 590 seqlets_path = os.path.join(out_dir, "seqlets.tsv") 591 data_io.write_modisco_seqlets(seqlets_collected, seqlets_path) 592 593 if confusion_df is not None and confusion_mat is not None: 594 seqlet_confusion_path = os.path.join(out_dir, "seqlet_confusion.tsv") 595 data_io.write_seqlet_confusion_df(confusion_df, seqlet_confusion_path) 596 597 visualization.plot_hit_vs_seqlet_counts(report_data, out_dir) 598 visualization.plot_seqlet_confusion_heatmap( 599 confusion_mat, motif_names_list, out_dir 600 ) 601 602 report_path = os.path.join(out_dir, "report.html") 603 visualization.write_report( 604 report_df, motif_names_list, report_path, compute_recall, seqlets_df is not None 605 )
Generate comprehensive HTML report with statistics and visualizations.
This function creates detailed analysis reports comparing Fi-NeMo hit calling results with TF-MoDISco seqlets, including performance metrics, distribution plots, and motif visualization. The report provides insights into hit calling quality and motif discovery accuracy.
Parameters
- regions_path (str): Path to NPZ file containing the same regions used for hit calling.
- hits_dir (str): Path to directory containing Fi-NeMo hit calling outputs.
- modisco_h5_path (str, optional): Path to TF-MoDISco H5 file. If None, seqlet comparisons are skipped.
- peaks_path (str, optional): DEPRECATED. Peak coordinates should be included in regions file.
- motifs_include_path (str, optional): DEPRECATED. This information is inferred from hit calling outputs.
- motif_names_path (str, optional): DEPRECATED. This information is inferred from hit calling outputs.
- out_dir (str): Output directory for report files.
- modisco_region_width (int, default 400): Width of regions used by TF-MoDISco (needed for coordinate conversion).
- cwm_trim_threshold (float, default 0.3): DEPRECATED. This information is inferred from hit calling outputs.
- compute_recall (bool, default True): Whether to compute recall metrics against TF-MoDISco seqlets.
- use_seqlets (bool, default True): Whether to include seqlet-based comparisons in the report.
Notes
The generated report includes:
- Hit vs seqlet count comparisons
- Motif CWM visualizations
- Hit statistic distributions
- Co-occurrence heatmaps
- Confusion matrices for overlapping motifs
608def collapse_hits(hits_path: str, out_path: str, overlap_frac: float = 0.2) -> None: 609 """Collapse overlapping hits by selecting the best hit per overlapping group. 610 611 This function processes a set of motif hits and identifies overlapping hits, 612 keeping only the hit with the highest similarity score within each overlapping 613 group. This reduces redundancy in hit calls while preserving the most confident 614 predictions. 615 616 Parameters 617 ---------- 618 hits_path : str 619 Path to input TSV file containing hit data (hits.tsv or hits_unique.tsv). 620 out_path : str 621 Path to output TSV file with additional 'is_primary' column. 622 overlap_frac : float, default 0.2 623 Minimum fractional overlap for considering hits as overlapping. 624 For hits of lengths x and y, minimum overlap = overlap_frac * (x + y) / 2. 625 626 Notes 627 ----- 628 The algorithm uses a sweep line approach with a heap data structure to 629 efficiently identify overlapping intervals and select the best hit based 630 on similarity scores. 631 """ 632 from . import postprocessing 633 634 hits_df = data_io.load_hits(hits_path, lazy=False) 635 hits_collapsed_df = postprocessing.collapse_hits(hits_df, overlap_frac) 636 637 data_io.write_hits_processed( 638 hits_collapsed_df, out_path, schema=data_io.HITS_COLLAPSED_DTYPES 639 )
Collapse overlapping hits by selecting the best hit per overlapping group.
This function processes a set of motif hits and identifies overlapping hits, keeping only the hit with the highest similarity score within each overlapping group. This reduces redundancy in hit calls while preserving the most confident predictions.
Parameters
- hits_path (str): Path to input TSV file containing hit data (hits.tsv or hits_unique.tsv).
- out_path (str): Path to output TSV file with additional 'is_primary' column.
- overlap_frac (float, default 0.2): Minimum fractional overlap for considering hits as overlapping. For hits of lengths x and y, minimum overlap = overlap_frac * (x + y) / 2.
Notes
The algorithm uses a sweep line approach with a heap data structure to efficiently identify overlapping intervals and select the best hit based on similarity scores.
642def intersect_hits(hits_paths: List[str], out_path: str, relaxed: bool = False) -> None: 643 """Find intersection of hits across multiple Fi-NeMo runs. 644 645 This function identifies motif hits that are consistently called across 646 multiple independent runs, providing a way to assess reproducibility and 647 identify high-confidence hits. 648 649 Parameters 650 ---------- 651 hits_paths : List[str] 652 List of paths to input TSV files from different runs. 653 out_path : str 654 Path to output TSV file containing intersection results. 655 Duplicate columns are suffixed with run index. 656 relaxed : bool, default False 657 If True, uses relaxed intersection criteria based only on motif names 658 and untrimmed coordinates. If False, assumes consistent region definitions 659 and motif trimming across runs. 660 661 Notes 662 ----- 663 The strict intersection mode requires consistent input regions and motif 664 processing parameters across all runs. The relaxed mode is more permissive 665 but may not be suitable when genomic coordinates are unavailable. 666 """ 667 from . import postprocessing 668 669 hits_dfs = [data_io.load_hits(hits_path, lazy=False) for hits_path in hits_paths] 670 hits_df = postprocessing.intersect_hits(hits_dfs, relaxed) 671 672 data_io.write_hits_processed(hits_df, out_path, schema=None)
Find intersection of hits across multiple Fi-NeMo runs.
This function identifies motif hits that are consistently called across multiple independent runs, providing a way to assess reproducibility and identify high-confidence hits.
Parameters
- hits_paths (List[str]): List of paths to input TSV files from different runs.
- out_path (str): Path to output TSV file containing intersection results. Duplicate columns are suffixed with run index.
- relaxed (bool, default False): If True, uses relaxed intersection criteria based only on motif names and untrimmed coordinates. If False, assumes consistent region definitions and motif trimming across runs.
Notes
The strict intersection mode requires consistent input regions and motif processing parameters across all runs. The relaxed mode is more permissive but may not be suitable when genomic coordinates are unavailable.
675def cli() -> None: 676 """Command-line interface for the Fi-NeMo motif instance calling pipeline. 677 678 This function provides the main entry point for all Fi-NeMo operations including: 679 - Data preprocessing from various formats (bigWig, HDF5, TF-MoDISco) 680 - Motif hit calling using the Fi-NeMo algorithm 681 - Report generation and visualization 682 - Post-processing operations (hit collapsing, intersection) 683 """ 684 parser = argparse.ArgumentParser() 685 subparsers = parser.add_subparsers(required=True, dest="cmd") 686 687 extract_regions_bw_parser = subparsers.add_parser( 688 "extract-regions-bw", 689 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 690 help="Extract sequences and contributions from FASTA and bigwig files.", 691 ) 692 693 extract_regions_bw_parser.add_argument( 694 "-p", 695 "--peaks", 696 type=str, 697 required=True, 698 help="A peak regions file in ENCODE NarrowPeak format.", 699 ) 700 extract_regions_bw_parser.add_argument( 701 "-C", 702 "--chrom-order", 703 type=str, 704 default=None, 705 help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.", 706 ) 707 extract_regions_bw_parser.add_argument( 708 "-f", 709 "--fasta", 710 type=str, 711 required=True, 712 help="A genome FASTA file. If an .fai index file doesn't exist in the same directory, it will be created.", 713 ) 714 extract_regions_bw_parser.add_argument( 715 "-b", 716 "--bigwigs", 717 type=str, 718 required=True, 719 nargs="+", 720 help="One or more bigwig files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.", 721 ) 722 723 extract_regions_bw_parser.add_argument( 724 "-o", 725 "--out-path", 726 type=str, 727 required=True, 728 help="The path to the output .npz file.", 729 ) 730 731 extract_regions_bw_parser.add_argument( 732 "-w", 733 "--region-width", 734 type=int, 735 default=inspect.signature(extract_regions_bw) 736 .parameters["region_width"] 737 .default, 738 help="The width of the input region centered around each peak summit.", 739 ) 740 741 extract_chrombpnet_regions_h5_parser = subparsers.add_parser( 742 "extract-regions-chrombpnet-h5", 743 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 744 help="Extract sequences and contributions from ChromBPNet contributions H5 files.", 745 ) 746 747 extract_chrombpnet_regions_h5_parser.add_argument( 748 "-p", 749 "--peaks", 750 type=str, 751 default=None, 752 help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.", 753 ) 754 extract_chrombpnet_regions_h5_parser.add_argument( 755 "-C", 756 "--chrom-order", 757 type=str, 758 default=None, 759 help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.", 760 ) 761 762 extract_chrombpnet_regions_h5_parser.add_argument( 763 "-c", 764 "--h5s", 765 type=str, 766 required=True, 767 nargs="+", 768 help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.", 769 ) 770 771 extract_chrombpnet_regions_h5_parser.add_argument( 772 "-o", 773 "--out-path", 774 type=str, 775 required=True, 776 help="The path to the output .npz file.", 777 ) 778 779 extract_chrombpnet_regions_h5_parser.add_argument( 780 "-w", 781 "--region-width", 782 type=int, 783 default=inspect.signature(extract_regions_chrombpnet_h5) 784 .parameters["region_width"] 785 .default, 786 help="The width of the input region centered around each peak summit.", 787 ) 788 789 extract_regions_h5_parser = subparsers.add_parser( 790 "extract-regions-h5", 791 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 792 help="Extract sequences and contributions from ChromBPNet contributions H5 files. DEPRECATED: Use `extract-regions-chrombpnet-h5` instead.", 793 ) 794 795 extract_regions_h5_parser.add_argument( 796 "-p", 797 "--peaks", 798 type=str, 799 default=None, 800 help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.", 801 ) 802 extract_regions_h5_parser.add_argument( 803 "-C", 804 "--chrom-order", 805 type=str, 806 default=None, 807 help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.", 808 ) 809 810 extract_regions_h5_parser.add_argument( 811 "-c", 812 "--h5s", 813 type=str, 814 required=True, 815 nargs="+", 816 help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.", 817 ) 818 819 extract_regions_h5_parser.add_argument( 820 "-o", 821 "--out-path", 822 type=str, 823 required=True, 824 help="The path to the output .npz file.", 825 ) 826 827 extract_regions_h5_parser.add_argument( 828 "-w", 829 "--region-width", 830 type=int, 831 default=inspect.signature(extract_regions_chrombpnet_h5) 832 .parameters["region_width"] 833 .default, 834 help="The width of the input region centered around each peak summit.", 835 ) 836 837 extract_bpnet_regions_h5_parser = subparsers.add_parser( 838 "extract-regions-bpnet-h5", 839 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 840 help="Extract sequences and contributions from BPNet contributions H5 files.", 841 ) 842 843 extract_bpnet_regions_h5_parser.add_argument( 844 "-p", 845 "--peaks", 846 type=str, 847 default=None, 848 help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.", 849 ) 850 extract_bpnet_regions_h5_parser.add_argument( 851 "-C", 852 "--chrom-order", 853 type=str, 854 default=None, 855 help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.", 856 ) 857 858 extract_bpnet_regions_h5_parser.add_argument( 859 "-c", 860 "--h5s", 861 type=str, 862 required=True, 863 nargs="+", 864 help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.", 865 ) 866 867 extract_bpnet_regions_h5_parser.add_argument( 868 "-o", 869 "--out-path", 870 type=str, 871 required=True, 872 help="The path to the output .npz file.", 873 ) 874 875 extract_bpnet_regions_h5_parser.add_argument( 876 "-w", 877 "--region-width", 878 type=int, 879 default=inspect.signature(extract_regions_bpnet_h5) 880 .parameters["region_width"] 881 .default, 882 help="The width of the input region centered around each peak summit.", 883 ) 884 885 extract_regions_modisco_fmt_parser = subparsers.add_parser( 886 "extract-regions-modisco-fmt", 887 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 888 help="Extract sequences and contributions from tfmodisco-lite input files.", 889 ) 890 891 extract_regions_modisco_fmt_parser.add_argument( 892 "-p", 893 "--peaks", 894 type=str, 895 default=None, 896 help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.", 897 ) 898 extract_regions_modisco_fmt_parser.add_argument( 899 "-C", 900 "--chrom-order", 901 type=str, 902 default=None, 903 help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.", 904 ) 905 906 extract_regions_modisco_fmt_parser.add_argument( 907 "-s", 908 "--sequences", 909 type=str, 910 required=True, 911 help="A .npy or .npz file containing one-hot encoded sequences.", 912 ) 913 914 extract_regions_modisco_fmt_parser.add_argument( 915 "-a", 916 "--attributions", 917 type=str, 918 required=True, 919 nargs="+", 920 help="One or more .npy or .npz files of hypothetical contribution scores, with paths delimited by whitespace. Scores are averaged across files.", 921 ) 922 923 extract_regions_modisco_fmt_parser.add_argument( 924 "-o", 925 "--out-path", 926 type=str, 927 required=True, 928 help="The path to the output .npz file.", 929 ) 930 931 extract_regions_modisco_fmt_parser.add_argument( 932 "-w", 933 "--region-width", 934 type=int, 935 default=inspect.signature(extract_regions_modisco_fmt) 936 .parameters["region_width"] 937 .default, 938 help="The width of the input region centered around each peak summit.", 939 ) 940 941 call_hits_parser = subparsers.add_parser( 942 "call-hits", 943 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 944 help="Call hits on provided sequences, contributions, and motif CWM's.", 945 ) 946 947 call_hits_parser.add_argument( 948 "-M", 949 "--mode", 950 type=str, 951 default=inspect.signature(call_hits).parameters["mode"].default, 952 choices={"pp", "ph", "hp", "hh"}, 953 help="The type of attributions to use for CWM's and input contribution scores, respectively. 'h' for hypothetical and 'p' for projected.", 954 ) 955 956 call_hits_parser.add_argument( 957 "-r", 958 "--regions", 959 type=str, 960 required=True, 961 help="A .npz file of input sequences, contributions, and coordinates. Can be generated using `finemo extract-regions-*` subcommands.", 962 ) 963 call_hits_parser.add_argument( 964 "-m", 965 "--modisco-h5", 966 type=str, 967 required=True, 968 help="A tfmodisco-lite output H5 file of motif patterns.", 969 ) 970 971 call_hits_parser.add_argument( 972 "-p", 973 "--peaks", 974 type=str, 975 default=None, 976 help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.", 977 ) 978 call_hits_parser.add_argument( 979 "-C", 980 "--chrom-order", 981 type=str, 982 default=None, 983 help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.", 984 ) 985 986 call_hits_parser.add_argument( 987 "-I", 988 "--motifs-include", 989 type=str, 990 default=None, 991 help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column to include in hit calling. If omitted, all motifs in the modisco H5 file are used.", 992 ) 993 call_hits_parser.add_argument( 994 "-N", 995 "--motif-names", 996 type=str, 997 default=None, 998 help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom names in the second column. Omitted motifs default to tfmodisco names.", 999 ) 1000 1001 call_hits_parser.add_argument( 1002 "-o", 1003 "--out-dir", 1004 type=str, 1005 required=True, 1006 help="The path to the output directory.", 1007 ) 1008 1009 call_hits_parser.add_argument( 1010 "-t", 1011 "--cwm-trim-threshold", 1012 type=float, 1013 default=inspect.signature(call_hits) 1014 .parameters["cwm_trim_threshold_default"] 1015 .default, 1016 help="The default threshold to determine motif start and end positions within the full CWMs.", 1017 ) 1018 call_hits_parser.add_argument( 1019 "-T", 1020 "--cwm-trim-thresholds", 1021 type=str, 1022 default=None, 1023 help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom trim thresholds in the second column. Omitted motifs default to the `--cwm-trim-threshold` value.", 1024 ) 1025 call_hits_parser.add_argument( 1026 "-R", 1027 "--cwm-trim-coords", 1028 type=str, 1029 default=None, 1030 help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom trim start and end coordinates in the second and third columns, respectively. Omitted motifs default to `--cwm-trim-thresholds` values.", 1031 ) 1032 1033 call_hits_parser.add_argument( 1034 "-l", 1035 "--global-lambda", 1036 type=float, 1037 default=inspect.signature(call_hits).parameters["lambda_default"].default, 1038 help="The default L1 regularization weight determining the sparsity of hits.", 1039 ) 1040 call_hits_parser.add_argument( 1041 "-L", 1042 "--motif-lambdas", 1043 type=str, 1044 default=None, 1045 help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and motif-specific lambdas in the second column. Omitted motifs default to the `--global-lambda` value.", 1046 ) 1047 call_hits_parser.add_argument( 1048 "-a", 1049 "--alpha", 1050 type=float, 1051 default=None, 1052 help="DEPRECATED: Please use the `--lambda` argument instead.", 1053 ) 1054 call_hits_parser.add_argument( 1055 "-A", 1056 "--motif-alphas", 1057 type=str, 1058 default=None, 1059 help="DEPRECATED: Please use the `--motif-lambdas` argument instead.", 1060 ) 1061 1062 call_hits_parser.add_argument( 1063 "-f", 1064 "--no-post-filter", 1065 action="store_true", 1066 help="Do not perform post-hit-calling filtering. By default, hits are filtered based on a minimum cosine similarity of `lambda` with the input contributions.", 1067 ) 1068 call_hits_parser.add_argument( 1069 "-q", 1070 "--sqrt-transform", 1071 action="store_true", 1072 help="Apply a signed square root transform to the input contributions and CWMs before hit calling.", 1073 ) 1074 call_hits_parser.add_argument( 1075 "-s", 1076 "--step-size-max", 1077 type=float, 1078 default=inspect.signature(call_hits).parameters["step_size_max"].default, 1079 help="The maximum optimizer step size.", 1080 ) 1081 call_hits_parser.add_argument( 1082 "-i", 1083 "--step-size-min", 1084 type=float, 1085 default=inspect.signature(call_hits).parameters["step_size_min"].default, 1086 help="The minimum optimizer step size.", 1087 ) 1088 call_hits_parser.add_argument( 1089 "-j", 1090 "--step-adjust", 1091 type=float, 1092 default=inspect.signature(call_hits).parameters["step_adjust"].default, 1093 help="The optimizer step size adjustment factor. If the optimizer diverges, the step size is multiplicatively adjusted by this factor", 1094 ) 1095 call_hits_parser.add_argument( 1096 "-c", 1097 "--convergence-tol", 1098 type=float, 1099 default=inspect.signature(call_hits).parameters["convergence_tol"].default, 1100 help="The tolerance for determining convergence. The optimizer exits when the duality gap is less than the tolerance.", 1101 ) 1102 call_hits_parser.add_argument( 1103 "-S", 1104 "--max-steps", 1105 type=int, 1106 default=inspect.signature(call_hits).parameters["max_steps"].default, 1107 help="The maximum number of optimization steps.", 1108 ) 1109 call_hits_parser.add_argument( 1110 "-b", 1111 "--batch-size", 1112 type=int, 1113 default=inspect.signature(call_hits).parameters["batch_size"].default, 1114 help="The batch size used for optimization.", 1115 ) 1116 call_hits_parser.add_argument( 1117 "-d", 1118 "--device", 1119 type=str, 1120 default=None, 1121 help="DEPRECATED: Please use the `CUDA_VISIBLE_DEVICES` environment variable to specify the GPU device.", 1122 ) 1123 call_hits_parser.add_argument( 1124 "-J", 1125 "--compile", 1126 action="store_true", 1127 help="JIT-compile the optimizer for faster performance. This may not be supported on older GPUs.", 1128 ) 1129 1130 report_parser = subparsers.add_parser( 1131 "report", 1132 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 1133 help="Generate statistics and visualizations from hits and tfmodisco-lite motif data.", 1134 ) 1135 1136 report_parser.add_argument( 1137 "-r", 1138 "--regions", 1139 type=str, 1140 required=True, 1141 help="A .npz file containing input sequences, contributions, and coordinates. Must be the same as that used for `finemo call-hits`.", 1142 ) 1143 report_parser.add_argument( 1144 "-H", 1145 "--hits", 1146 type=str, 1147 required=True, 1148 help="The output directory generated by the `finemo call-hits` command on the regions specified in `--regions`.", 1149 ) 1150 report_parser.add_argument( 1151 "-p", 1152 "--peaks", 1153 type=str, 1154 default=None, 1155 help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.", 1156 ) 1157 report_parser.add_argument( 1158 "-m", 1159 "--modisco-h5", 1160 type=str, 1161 default=None, 1162 help="The tfmodisco-lite output H5 file of motif patterns. Must be the same as that used for hit calling unless `--no-recall` is set. If omitted, seqlet-derived metrics will not be computed.", 1163 ) 1164 report_parser.add_argument( 1165 "-I", 1166 "--motifs-include", 1167 type=str, 1168 default=None, 1169 help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.", 1170 ) 1171 report_parser.add_argument( 1172 "-N", 1173 "--motif-names", 1174 type=str, 1175 default=None, 1176 help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.", 1177 ) 1178 1179 report_parser.add_argument( 1180 "-o", 1181 "--out-dir", 1182 type=str, 1183 required=True, 1184 help="The path to the report output directory.", 1185 ) 1186 1187 report_parser.add_argument( 1188 "-W", 1189 "--modisco-region-width", 1190 type=int, 1191 default=inspect.signature(report).parameters["modisco_region_width"].default, 1192 help="The width of the region around each peak summit used by tfmodisco-lite.", 1193 ) 1194 report_parser.add_argument( 1195 "-t", 1196 "--cwm-trim-threshold", 1197 type=float, 1198 default=inspect.signature(report).parameters["cwm_trim_threshold"].default, 1199 help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.", 1200 ) 1201 report_parser.add_argument( 1202 "-n", 1203 "--no-recall", 1204 action="store_true", 1205 help="Do not compute motif recall metrics.", 1206 ) 1207 report_parser.add_argument( 1208 "-s", 1209 "--no-seqlets", 1210 action="store_true", 1211 help="DEPRECATED: Please omit the `--modisco-h5` argument instead.", 1212 ) 1213 1214 collapse_hits_parser = subparsers.add_parser( 1215 "collapse-hits", 1216 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 1217 help="Identify best hit by motif similarity among sets of overlapping hits.", 1218 ) 1219 1220 collapse_hits_parser.add_argument( 1221 "-i", 1222 "--hits", 1223 type=str, 1224 required=True, 1225 help="The `hits.tsv` or `hits_unique.tsv` file from `call-hits`.", 1226 ) 1227 collapse_hits_parser.add_argument( 1228 "-o", 1229 "--out-path", 1230 type=str, 1231 required=True, 1232 help='The path to the output .tsv file with an additional "is_primary" column.', 1233 ) 1234 collapse_hits_parser.add_argument( 1235 "-O", 1236 "--overlap-frac", 1237 type=float, 1238 default=inspect.signature(collapse_hits).parameters["overlap_frac"].default, 1239 help="The threshold for determining overlapping hits. For two hits with lengths x and y, the minimum overlap is defined as `overlap_frac * (x + y) / 2`. The default value of 0.2 means that two hits must overlap by at least 20% of their average lengths to be considered overlapping.", 1240 ) 1241 1242 intersect_hits_parser = subparsers.add_parser( 1243 "intersect-hits", 1244 formatter_class=argparse.ArgumentDefaultsHelpFormatter, 1245 help="Intersect hits across multiple runs.", 1246 ) 1247 1248 intersect_hits_parser.add_argument( 1249 "-i", 1250 "--hits", 1251 type=str, 1252 required=True, 1253 nargs="+", 1254 help="One or more hits.tsv or hits_unique.tsv files, with paths delimited by whitespace.", 1255 ) 1256 intersect_hits_parser.add_argument( 1257 "-o", 1258 "--out-path", 1259 type=str, 1260 required=True, 1261 help="The path to the output .tsv file. Duplicate columns are suffixed with the positional index of the input file.", 1262 ) 1263 intersect_hits_parser.add_argument( 1264 "-r", 1265 "--relaxed", 1266 action="store_true", 1267 help="Use relaxed intersection criteria, using only motif names and untrimmed coordinates. By default, the intersection assumes consistent region definitions and motif trimming. This option is not recommended if genomic coordinates are unavailable.", 1268 ) 1269 1270 args = parser.parse_args() 1271 1272 if args.cmd == "extract-regions-bw": 1273 extract_regions_bw( 1274 args.peaks, 1275 args.chrom_order, 1276 args.fasta, 1277 args.bigwigs, 1278 args.out_path, 1279 args.region_width, 1280 ) 1281 1282 elif args.cmd == "extract-regions-chrombpnet-h5": 1283 extract_regions_chrombpnet_h5( 1284 args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width 1285 ) 1286 1287 elif args.cmd == "extract-regions-h5": 1288 print( 1289 "WARNING: The `extract-regions-h5` command is deprecated. Use `extract-regions-chrombpnet-h5` instead." 1290 ) 1291 extract_regions_chrombpnet_h5( 1292 args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width 1293 ) 1294 1295 elif args.cmd == "extract-regions-bpnet-h5": 1296 extract_regions_bpnet_h5( 1297 args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width 1298 ) 1299 1300 elif args.cmd == "extract-regions-modisco-fmt": 1301 extract_regions_modisco_fmt( 1302 args.peaks, 1303 args.chrom_order, 1304 args.attributions, 1305 args.sequences, 1306 args.out_path, 1307 args.region_width, 1308 ) 1309 1310 elif args.cmd == "call-hits": 1311 if args.alpha is not None: 1312 warnings.warn( 1313 "The `--alpha` flag is deprecated and will be removed in a future version. Please use the `--global-lambda` flag instead." 1314 ) 1315 args.global_lambda = args.alpha 1316 if args.motif_alphas is not None: 1317 warnings.warn( 1318 "The `--motif-alphas` flag is deprecated and will be removed in a future version. Please use the `--motif-lambdas` flag instead." 1319 ) 1320 args.motif_lambdas = args.motif_alphas 1321 1322 call_hits( 1323 args.regions, 1324 args.peaks, 1325 args.modisco_h5, 1326 args.chrom_order, 1327 args.motifs_include, 1328 args.motif_names, 1329 args.motif_lambdas, 1330 args.out_dir, 1331 args.cwm_trim_coords, 1332 args.cwm_trim_thresholds, 1333 args.cwm_trim_threshold, 1334 args.global_lambda, 1335 args.step_size_max, 1336 args.step_size_min, 1337 args.sqrt_transform, 1338 args.convergence_tol, 1339 args.max_steps, 1340 args.batch_size, 1341 args.step_adjust, 1342 args.device, 1343 args.mode, 1344 args.no_post_filter, 1345 args.compile, 1346 ) 1347 1348 elif args.cmd == "report": 1349 report( 1350 args.regions, 1351 args.hits, 1352 args.modisco_h5, 1353 args.peaks, 1354 args.motifs_include, 1355 args.motif_names, 1356 args.out_dir, 1357 args.modisco_region_width, 1358 args.cwm_trim_threshold, 1359 not args.no_recall, 1360 not args.no_seqlets, 1361 ) 1362 1363 elif args.cmd == "collapse-hits": 1364 collapse_hits(args.hits, args.out_path, args.overlap_frac) 1365 1366 elif args.cmd == "intersect-hits": 1367 intersect_hits(args.hits, args.out_path, args.relaxed)
Command-line interface for the Fi-NeMo motif instance calling pipeline.
This function provides the main entry point for all Fi-NeMo operations including:
- Data preprocessing from various formats (bigWig, HDF5, TF-MoDISco)
- Motif hit calling using the Fi-NeMo algorithm
- Report generation and visualization
- Post-processing operations (hit collapsing, intersection)