Konwertowanie DNA w formacie 2bit na ciąg binarny

osobliwy nick

2024-09-26T02:51:18+02:00

Rejestracja: dni
Ostatnio: dni
Postów: 34

0

Pobrałem sobie jakiś genom (chyba, bo nie mam pojęcia co dokładnie jest w pliku):

https://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/

i usiłuję go przeczytać w celu zamiany zasad, których się tam spodziewam w ciągu typu:

gcctagtacagactctccctgcagatgaaattatatgggatgctaaatta

na liczby binarne. Ale nie wiem jak otworzyć ten format, ani nie znalazłem w Internecie niczego co by działało jak dotychczas. Tu znalazłem jakiś kod:

https://pythonhosted.org/twobitreader/twobitreader.html

Pokopiowałemm to jedno po drugim (nie wiem, czy autor nie przewidział, że ktoś będzie chciał skopiować cały kod na raz, czy coś robię źle) i spróbowałem tak:

Kopiuj

"""
twobitreader

Licensed under Perl Artistic License 2.0
No warranty is provided, express or implied
"""
from array import array
from bisect import bisect_right
from errno import ENOENT, EACCES
from os import R_OK, access

try:
    from os import strerror
except ImportError:
    strerror = lambda x: 'strerror not supported'
from os.path import exists


class TwoBitSequence(object):
    """
A TwoBitSequence object refers to an entry in a TwoBitFile

You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']
>>> chr20[100100:100200] # slicing returns a string
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat
ttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20) # get whole chr as string

Fair warning: dumping the entire chromosome requires a lot of memory

Note that we follow python/UCSC conventions:
Coordinates are 0-based, end-open
(Note: The UCSC web-based genome browser uses 1-based closed coordinates)
If you attempt to access a slice past the end of the sequence,
it will be truncated at the end.

Your computer probably doesn't have enough memory to load a whole genome
but if you want to string-ize your TwoBitFile, here's a recipe:

x = TwoBitFile('my.2bit')
d = x.dict()
for k,v in d.iteritems(): d[k] = str(v)
    """

    def __init__(self, file_handle, offset, byteswapped=False):
        self._file_handle = file_handle
        self._original_offset = offset
        self._byteswapped = byteswapped
        file_handle.seek(offset)
        header = array(LONG)
        header.fromfile(file_handle, 2)
        if byteswapped: header.byteswap()
        dna_size, n_block_count = header
        self._dna_size = dna_size
        self._packed_dna_size = (dna_size + 15) / 16  # this is 32-bit fragments
        n_block_starts = array(LONG)
        n_block_sizes = array(LONG)
        n_block_starts.fromfile(file_handle, n_block_count)
        if byteswapped: n_block_starts.byteswap()
        n_block_sizes.fromfile(file_handle, n_block_count)
        if byteswapped: n_block_sizes.byteswap()
        self._n_block_starts = n_block_starts
        self._n_block_sizes = n_block_sizes
        mask_rawc = array(LONG)
        mask_rawc.fromfile(file_handle, 1)
        if byteswapped: mask_rawc.byteswap()
        mask_block_count = mask_rawc[0]
        mask_block_starts = array(LONG)
        mask_block_starts.fromfile(file_handle, mask_block_count)
        if byteswapped: mask_block_starts.byteswap()
        mask_block_sizes = array(LONG)
        mask_block_sizes.fromfile(file_handle, mask_block_count)
        if byteswapped: mask_block_sizes.byteswap()
        self._mask_block_starts = mask_block_starts
        self._mask_block_sizes = mask_block_sizes
        file_handle.read(4)
        self._offset = file_handle.tell()

    def __len__(self):
        return self._dna_size

    def __getslice__(self, min_, max_=None):
        return self.get_slice(min_, max_)


def get_slice(self, min_, max_=None):
    """
    get_slice returns only a sub-sequence
    """
    # handle negative coordinates
    dna_size = self._dna_size
    if max_ < 0:
        if max_ < -dna_size: raise IndexError('index out of range')
        max_ = dna_size + 1 + max_
    if min_ < 0:
        if max_ < -dna_size: raise IndexError('index out of range')
        min_ = dna_size + 1 + min_
    # make sure there's a proper range
    if min_ > max_ and max_ is not None: return ''
    if max_ == 0: return ''
    # load all the data
    if max_ > dna_size: max_ = dna_size
    file_handle = self._file_handle
    byteswapped = self._byteswapped
    n_block_starts = self._n_block_starts
    n_block_sizes = self._n_block_sizes
    mask_block_starts = self._mask_block_starts
    mask_block_sizes = self._mask_block_sizes
    offset = self._offset
    packed_dna_size = self._packed_dna_size

    # region_size is how many bases the region is
    if max_ is None:
        region_size = dna_size - min_
    else:
        region_size = max_ - min_

    # start_block, end_block are the first/last 32-bit blocks we need
    # note: end_block is not read
    # blocks start at 0
    start_block = min_ / 16
    end_block = max_ / 16
    # don't read past seq end
    if end_block >= packed_dna_size: end_block = packed_dna_size - 1
    # +1 we still need to read block
    blocks_to_read = end_block - start_block + 1

    # jump directly to desired file location
    local_offset = offset + start_block * 4
    file_handle.seek(local_offset)

    # note we won't actually read the last base
    # this is a python slice first_base_offset:16*blocks+last_base_offset
    first_base_offset = min_ % 16
    last_base_offset = max_ % 16

    fourbyte_dna = array(LONG)
    fourbyte_dna.fromfile(file_handle, blocks_to_read)
    if byteswapped: fourbyte_dna.byteswap()
    string_as_array = longs_to_char_array(fourbyte_dna, first_base_offset,
                                          last_base_offset, region_size)
    for start, size in izip(n_block_starts, n_block_sizes):
        end = start + size
        if end <= min_: continue
        if start > max_: break
        if start < min_: start = min_
        if end > max_: end = max_
        start -= min_
        end -= min_
        string_as_array[start:end] = array('c', 'N' * (end - start))
    lower = str.lower
    first_masked_region = max(0,
                              bisect_right(mask_block_starts, min_) - 1)
    last_masked_region = min(len(mask_block_starts),
                             1 + bisect_right(mask_block_starts, max_,
                                              lo=first_masked_region))
    for start, size in izip(mask_block_starts[first_masked_region:last_masked_region],
                            mask_block_sizes[first_masked_region:last_masked_region]):
        end = start + size
        if end <= min_: continue
        if start > max_: break
        if start < min_: start = min_
        if end > max_: end = max_
        start -= min_
        end -= min_
        string_as_array[start:end] = array('c', lower(string_as_array[start:end].tostring()))
    #if not len(string_as_array) == max_ - min_:
        #raise RuntimeError, "Sequence was longer than it should be"
    return string_as_array.tostring()


def __str__(self):
    """
    returns the entire chromosome
    """
    return self.__getslice__(0, None)

def true_long_type():
    """
    OS X uses an 8-byte long, so make sure L (long) is the right size
    and switch to I (int) if needed
    """
    for type_ in ['L', 'I']:
        test_array = array(type_, [0])
        long_size = test_array.itemsize
        if long_size == 4: return type_
    raise ImportError("Couldn't determine a valid 4-byte long type to use \
as equivalent to LONG")


LONG = true_long_type()

def byte_to_bases(x):
    """convert one byte to the four bases it encodes"""
    c = (x >> 4) & 0xf
    f = x & 0xf
    cc = (c >> 2) & 0x3
    cf = c & 0x3
    fc = (f >> 2) & 0x3
    ff = f & 0x3
    return list(map(bits_to_base, (cc, cf, fc, ff)))


def bits_to_base(x):
    """convert integer representation of two bits to correct base"""
    if x == 0: return 'T'
    if x == 1: return 'C'
    if x == 2: return 'A'
    if x == 3: return 'G'


def base_to_bin(x):
    """
    provided for user convenience
    convert a nucleotide to its bit representation
    """
    if x == 'T': return '00'
    if x == 'C': return '01'
    if x == 'A': return '10'
    if x == 'G': return '11'



def create_byte_table():
    """create BYTE_TABLE"""
    d = {}
    for x in range(2 ** 8):
        d[x] = byte_to_bases(x)
    return d



def split16(x):
    """
    split a 16-bit number into integer representation
    of its course and fine parts in binary representation
    """
    c = (x >> 8) & 0xff
    f = x & 0xff
    return c, f



def create_twobyte_table():
    """create TWOBYTE_TABLE"""
    d = {}
    for x in range(2 ** 16):
        c, f = split16(x)
        d[x] = byte_to_bases(c) + byte_to_bases(f)
    return d


BYTE_TABLE = create_byte_table()
TWOBYTE_TABLE = create_twobyte_table()


def longs_to_char_array(longs, first_base_offset, last_base_offset, array_size):
    """
    takes in a iterable of longs and converts them to bases in a char array
    returns a ctypes string buffer
    """
    longs_len = len(longs)
    # dna = ctypes.create_string_buffer(array_size)
    dna = array('c', 'N' * longs_len)
    # translate from 32-bit blocks to bytes
    # this method ensures correct endianess (byteswap as neeed)
    bytes = array('B')
    bytes.fromstring(longs.tostring())
    # first block
    first_block = ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(4)])
    i = 16 - first_base_offset
    if array_size < i: i = array_size
    dna[0:i] = array('c', first_block[first_base_offset:first_base_offset + i])
    if longs_len == 1: return dna
    # middle blocks (implicitly skipped if they don't exist)
    for byte in bytes[4:-4]:
        dna[i:i + 4] = array('c', BYTE_TABLE[byte])
        i += 4
    # last block
    last_block = array('c', ''.join([''.join(BYTE_TABLE[bytes[x]]) for x in range(-4, 0)]))
    dna[i:i + last_base_offset] = last_block[0:last_base_offset]
    return dna

class TwoBitFile(dict):
    """
python-level reader for .2bit files (i.e., from UCSC genome browser)
(note: no writing support)

TwoBitFile inherits from dict
You may access sequences by name, e.g.
>>> genome = TwoBitFile('hg18.2bit')
>>> chr20 = genome['chr20']

Sequences are returned as TwoBitSequence objects
You may access intervals by slicing or using str() to dump the entire entry
e.g.
>>> chr20[100100:100200]
'ttttcctctaagataatttttgccttaaatactattttgttcaatactaagaagtaagataacttccttttgttggtat
ttgcatgttaagtttttttcc'
>>> whole_chr20 = str(chr20)

Fair warning: dumping the entire chromosome requires a lot of memory

See TwoBitSequence for more info
    """

    def __init__(self, foo):
        super(TwoBitFile, self).__init__()
        if not exists(foo):
            raise IOError(ENOENT, strerror(ENOENT), foo)
        if not access(foo, R_OK):
            raise IOError(EACCES, strerror(EACCES), foo)
        self._filename = foo
        self._file_handle = open(foo, 'rb')
        self._load_header()
        self._load_index()
        for name, offset in self._offset_dict.items():
            self[name] = TwoBitSequence(self._file_handle, offset, self._byteswapped)
        return

    def _load_header(self):
        file_handle = self._file_handle
        header = array(LONG)
        header.fromfile(file_handle, 4)
        # check signature -- must be 0x1A412743
        # if not, swap bytes
        byteswapped = False
        (signature, version, sequence_count, reserved) = header
        if not signature == 0x1A412743:
            byteswapped = True
            header.byteswap()
            (signature2, version, sequence_count, reserved) = header
            if not signature2 == 0x1A412743:
                raise TwoBitFileError('Signature in header should be 0x1A412743'
                                      + ', instead found 0x%X' % signature)
        if not version == 0:
            raise TwoBitFileError('File version in header should be 0.')
        if not reserved == 0:
            raise TwoBitFileError('Reserved field in header should be 0.')
        self._byteswapped = byteswapped
        self._sequence_count = sequence_count

    def _load_index(self):
        file_handle = self._file_handle
        byteswapped = self._byteswapped
        remaining = self._sequence_count
        sequence_offsets = []
        file_handle.seek(16)
        while True:
            if remaining == 0: break
            name_size = array('B')
            name_size.fromfile(file_handle, 1)
            if byteswapped: name_size.byteswap()
            name = array('b')
            if byteswapped: name.byteswap()
            name.fromfile(file_handle, name_size[0])
            offset = array(LONG)
            offset.fromfile(file_handle, 1)
            if byteswapped: offset.byteswap()
            sequence_offsets.append((name.tobytes(), offset[0]))
            remaining -= 1
        self._sequence_offsets = sequence_offsets
        self._offset_dict = dict(sequence_offsets)

def sequence_sizes(self):
    """returns a dictionary with the sizes of each sequence"""
    d = {}
    file_handle = self._file_handle
    byteswapped = self._byteswapped
    for name, offset in self._offset_dict.iteritems():
        file_handle.seek(offset)
        dna_size = array(LONG)
        dna_size.fromfile(file_handle, 1)
        if byteswapped: dna_size.byteswap()
        d[name] = dna_size[0]
    return d


genome = TwoBitFile(r'C:\Users\48791\Downloads\hs1.2bit')
chr20 = genome['chr20']
print(str(chr20))

Dostaję błąd "KeyError: 'chr20'". Jak użyć tego kodu poprawnie? A może ktoś ma inny pomysł jak otworzyć ten format w celu przekonwertowania go na coś normalnego, choćby txt?

johnny_Be_good

2024-09-26T05:45:36+02:00

Rejestracja: dni
Ostatnio: dni
Lokalizacja: Holandia
Postów: 853

1

To nie jest dobry kod, użyj GPT

osobliwy nick

2024-09-26T07:16:25+02:00

Rejestracja: dni
Ostatnio: dni
Postów: 34

0

Napisał mi taki kod:

Kopiuj

import py2bit


def read_first_100_bases(file_path):
    try:
        # Otwieramy plik .2bit
        tbf = py2bit.open(file_path)

        # Uzyskujemy listę chromosomów dostępnych w pliku
        chroms = tbf.chroms()

        # Wybieramy pierwszy chromosom (możesz zmienić na dowolny)
        first_chrom = list(chroms.keys())[0]
        chrom_length = chroms[first_chrom]

        print(f'Chromosom: {first_chrom}, Długość: {chrom_length}')

        # Odczytujemy pierwsze 100 zasad (nukleotydów)
        num_bases = 100
        if chrom_length < num_bases:
            num_bases = chrom_length  # Jeśli chromosom jest krótszy niż 100 zasad

        sequence = tbf.sequence(first_chrom, 0, num_bases)

        print(f'Pierwsze {num_bases} zasady w chromosomie {first_chrom}:')
        print(sequence)

        tbf.close()
    except Exception as e:
        print(f"Wystąpił błąd: {e}")


# Przykład użycia
file_path = r'C:\Users\48791\Downloads\hs1.2bit'
read_first_100_bases(file_path)

Ale dostaję błąd ModuleNotFoundError: No module named 'py2bit', pomimo, że mam zainstolowany py2bit. I tego już nie potrafi rozwiązać. Więc nie wiem, czy ten kod działa.

lion137

2024-09-26T09:11:48+02:00

Rejestracja: dni
Ostatnio: dni
Postów: 5025

0

Z Pythonem3.12 działa:

Kopiuj

(venv) ➜  tests python3.12 -m venv venv
(venv) ➜  tests source venv/bin/activate
(venv) ➜  tests pip install git+https://github.com/dpryan79/py2bit
Collecting git+https://github.com/dpryan79/py2bit
  Cloning https://github.com/dpryan79/py2bit to /tmp/pip-req-build-dshgqw4r
  Running command git clone --filter=blob:none --quiet https://github.com/dpryan79/py2bit /tmp/pip-req-build-dshgqw4r
  Resolved https://github.com/dpryan79/py2bit to commit 697884f1c0b0bf4d3c9b82c4fa6a4445a2453c6c
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: py2bit
  Building wheel for py2bit (pyproject.toml) ... done
  Created wheel for py2bit: filename=py2bit-0.3.1-cp312-cp312-linux_x86_64.whl size=45223 sha256=a08579aee53bdcdd36922348267d402815e085c9105ccafe3ac28143d292ba95
  Stored in directory: /tmp/pip-ephem-wheel-cache-95hnl4e1/wheels/9c/e4/d4/d7e82584667f1ccca6b471b4220e3cc1f9f2c0bbfdfc3a1aa4
Successfully built py2bit
Installing collected packages: py2bit
Successfully installed py2bit-0.3.1
(venv) ➜  tests python
Python 3.12.6 (main, Sep 10 2024, 00:05:17) [GCC 11.4.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import py2bit
>>>

osobliwy nick

2024-09-26T10:06:49+02:00

Rejestracja: dni
Ostatnio: dni
Postów: 34

0

DarekRepos

2024-09-26T13:02:45+02:00

Rejestracja: dni
Ostatnio: dni
Postów: 50

0

problem jest pewnie już dawno rozwiązany, i gpt powinie dać rade napisać kod z konwersja tego na postać binarna, ale myślę ze kod wyżej już ma rozwiązanie które można wykorzystać:

Kopiuj

import py2bit

def base_to_bin(x):
    """
    Convert a nucleotide to its bit representation.
    """
    if x == 'T': return '00'
    if x == 'C': return '01'
    if x == 'A': return '10'
    if x == 'G': return '11'
    return '??'  # Unknown nucleotide case


with py2bit.open("hs1.2bit") as tb:
    print(tb.chroms())
    print("\n")
    print(tb.sequence("chr20", 0, 100))
    print("\n")
    sequence=tb.sequence("chr20", 0, 100)

    # Convert the sequence to binary
    binary_sequence = ''.join([base_to_bin(base) for base in sequence])
    print(f'Binary sequence:\n{binary_sequence}')

.GodOfCode.

2024-09-26T23:32:27+02:00

Rejestracja: dni
Ostatnio: dni
Postów: 358

0

Nigdy nie widziałem takiego krótkiego DNA

osobliwy nick

2024-09-27T04:52:47+02:00

Rejestracja: dni
Ostatnio: dni
Postów: 34

0

.GodOfCode. napisał(a):

Nigdy nie widziałem takiego krótkiego DNA

To tylko fragmenty. Ogólnie jest tam 25 chromosomów, w każdym po kilkaset zasad.

Kopiuj

{'chr1': 248387328, 'chr2': 242696752, 'chr3': 201105948, 'chr4': 193574945, 'chr5': 182045439, 'chr6': 172126628, 'chr7': 160567428, 'chrX': 154259566, 'chr9': 150617247, 'chr8': 146259331, 'chr11': 135127769, 'chr10': 134758134, 'chr12': 133324548, 'chr13': 113566686, 'chr14': 101161492, 'chr15': 99753195, 'chr16': 96330374, 'chr17': 84276897, 'chr18': 80542538, 'chr20': 66210255, 'chrY': 62460029, 'chr19': 61707364, 'chr22': 51324926, 'chr21': 45090682, 'chrM': 16569}