Skip to content

Commit 71c1950

Browse files
authored
Panel design notebook mark II (#157)
1 parent 426eae2 commit 71c1950

59 files changed

Lines changed: 109841 additions & 1440 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

CHANGELOG.md

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,17 @@ This project adheres to [Semantic Versioning](http://semver.org/).
44

55
## [Unreleased]
66

7+
### Added
8+
- Panel design notebooks (see #157).
9+
710
### Fixed
8-
- Debugged a test that counts observed haplotypes (#154).
9-
- Replaced global pooled Ae values with 26-population average as the default Ae reported (#155, #158).
10-
- Replaced deprecated `pkg_resources` module with `importlib.resources` (#156).
11-
- Upgraded versioneer to a Python 3.12+ compatible version (#156).
11+
- Debugged a test that counts observed haplotypes (see #154).
12+
- Replaced global pooled Ae values with 26-population average as the default Ae reported (see #155, #158).
13+
- Replaced deprecated `pkg_resources` module with `importlib.resources` (see #156).
14+
- Upgraded versioneer to a Python 3.12+ compatible version (see #156).
15+
16+
### Removed
17+
- Table flagging microhaps with repetitive content (see #153, #157).
1218

1319

1420
## [0.11] 2023-10-25
@@ -21,13 +27,13 @@ This project adheres to [Semantic Versioning](http://semver.org/).
2127
- Merged RSIDs resolved during database build now propagated to the final marker definition (see #149).
2228

2329
### Fixed
24-
- Added manual and automated fixes to ensure frequencies are formatted correcly and matche to the correct marker definition (see #150).
30+
- Added manual and automated fixes to ensure frequencies are formatted correctly and matched to the correct marker definition (see #150).
2531

2632

2733
## [0.10.1] 2023-10-13
2834

2935
### Fixed
30-
- Bug with offsets table (`marker --format=offsets`) when multiple markers are defined for a locus (#144).
36+
- Bug with offsets table (`marker --format=offsets`) when multiple markers are defined for a locus (see #144).
3137

3238

3339
## [0.10] 2023-09-15

dbbuild/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ The goal is that—if ever needed, heaven forbid—any reasonably capable bioinfo
2929
The MicroHapDB database can be rebuilt with the following command in the `dbbuild/` directory.
3030

3131
```
32-
./build.py databases/dbSNP/ databases/chains/ | tee build-summary.txt
32+
./build.py databases/dbSNP/ databases/chains/ --exclude Auton2015 | tee build-summary.txt
3333
```
3434

3535
The arguments provided to the build script will depend on the location of the dbSNP files and liftover chain files on the system.

dbbuild/build.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,11 @@
1515
from argparse import ArgumentParser
1616
from lib import SourceIndex
1717
from pathlib import Path
18-
from repeats import main as flag_repeats
1918
import sys
2019

2120

22-
def main(
23-
source_path, dbsnp_path, chain_path, rmsk_path, exclusions=["Auton2015"], check_only=False
24-
):
25-
validate_paths(dbsnp_path, chain_path, rmsk_path)
21+
def main(source_path, dbsnp_path, chain_path, exclusions=["Auton2015"], check_only=False):
22+
validate_paths(dbsnp_path, chain_path)
2623
if check_only:
2724
return
2825
index = SourceIndex(source_path, dbsnp_path, chain_path, exclude=exclusions)
@@ -35,12 +32,10 @@ def main(
3532
frequencies.to_csv("frequency.csv.gz", index=False, float_format="%.5f", compression="gzip")
3633
index.populations.to_csv("population.csv", index=False)
3734
index.merges.to_csv("merged.csv", index=False)
38-
repeats = flag_repeats(Path(rmsk_path) / "rmsk.txt.gz", "marker.csv", delta=25)
39-
repeats.to_csv("repeats.csv", index=False)
4035
print(index)
4136

4237

43-
def validate_paths(dbsnp_path, rmsk_path, chain_path):
38+
def validate_paths(dbsnp_path, chain_path):
4439
paths = list()
4540
for version in (37, 38):
4641
for extension in ("vcf.gz", "vcf.gz.tbi", "rsidx"):
@@ -49,7 +44,6 @@ def validate_paths(dbsnp_path, rmsk_path, chain_path):
4944
paths.append(Path(dbsnp_path) / "refsnp-merged.csv.gz")
5045
paths.append(Path(chain_path) / "hg19ToHg38.over.chain.gz")
5146
paths.append(Path(chain_path) / "hg38ToHg19.over.chain.gz")
52-
paths.append(Path(rmsk_path) / "rmsk.txt.gz")
5347
files_present = [p.is_file() for p in paths]
5448
print("-" * 60, "[Auxiliary data file check]\n", "Present Path", sep="\n", file=sys.stderr)
5549
for path, present in zip(paths, files_present):
@@ -81,7 +75,6 @@ def get_parser():
8175
parser = ArgumentParser(description="MicroHapDB database build procedure")
8276
parser.add_argument("dbsnp_path")
8377
parser.add_argument("chain_path")
84-
parser.add_argument("rmsk_path")
8578
parser.add_argument(
8679
"--sources",
8780
default="sources",
@@ -107,7 +100,6 @@ def get_parser():
107100
args.sources,
108101
args.dbsnp_path,
109102
args.chain_path,
110-
args.rmsk_path,
111103
exclusions=args.exclude,
112104
check_only=args.check,
113105
)

dbbuild/repeats.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,12 @@ def parse_ucsc_rmsk_track(path):
5858
"id",
5959
]
6060
table = pd.read_csv(path, sep="\t", names=header)
61+
table = table[
62+
(~table.repClass.isin(("SINE", "LINE", "LTR")))
63+
| ((table.repClass == "SINE") & (table.swScore > 929))
64+
| ((table.repClass == "LINE") & (table.swScore > 411))
65+
| ((table.repClass == "LTR") & (table.swScore > 909))
66+
]
6167
return table.groupby("genoName")
6268

6369

microhapdb/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
# -------------------------------------------------------------------------------------------------
1212

1313
from . import nomenclature
14-
from .tables import markers, merged, populations, frequencies, repeats, indels, variantmap, hg38
14+
from .tables import markers, merged, populations, frequencies, indels, variantmap, hg38
1515
from .population import Population
1616
from .marker import Marker, Locus
1717
from microhapdb import cli

microhapdb/cli/marker.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
import pandas as pd
1818
import sys
1919
from textwrap import dedent
20-
from warnings import warn
2120

2221

2322
def main(args):
@@ -96,7 +95,7 @@ def display(
9695
for marker in markers:
9796
loci[marker.locus].markers.append(marker)
9897
table = pd.concat([locus.definition for locus in loci.values()])
99-
table = table.rename(columns={"ChromOffset": f"OffsetHg38"})
98+
table = table.rename(columns={"ChromOffset": "OffsetHg38"})
10099
table.to_csv(sys.stdout, sep="\t", index=False)
101100
else:
102101
raise ValueError(f'unsupported view format "{view_format}"')

0 commit comments

Comments
 (0)