Problem
Given: FASTA 형식으로 주어지는 길이 약 1000의 DNA 서열 s와, s에 포함된 intron 서열들
Return: DNA 서열 s의 exon 부분을 전사·번역하여 얻은 단백질 서열
Sample Dataset
>Rosalind_10
ATGGTCTACATAGCTGACAAACAGCACGTAGCAATCGGTCGAATCTCGAGAGGCATATGGTCACATGATCGGTCGAGCGTGTTTCAAAGTTTGCGCCTAG
>Rosalind_12
ATCGGTCGAA
>Rosalind_15
ATCGGTCGAGCGTGT
Sample Output
MVYIADKQHVASREAYGHMFKVCA
예제 데이터 풀이
1. 첫 번째로 주어진 DNA 서열에서 intron 서열들을 제거한다.
2. 남은 exon 서열을 단백질(protein)로 번역한다.
풀이
## 1. 파일 읽기
import re, sys, os
# 2. Import sequence
# Read the whole FASTA file; the records are parsed from `lines` below.
with open('./rosalind_splc.txt', 'r') as f:
    lines = f.readlines()

# 3. seq to dictionary
# Raw string: '\S' in a normal string literal is an invalid escape sequence.
id_pattern = re.compile(r'>(\S+)')
# Maps record id -> sequence with its wrapped lines joined. Insertion order is
# file order, so the first entry is the first record in the file.
Seq_d = {}
# Id of the record currently being read; None until the first header is seen.
Seq_id = None
for line in lines:
    outh = id_pattern.search(line)
    if outh:
        Seq_id = outh.group(1)
    elif Seq_id is not None:
        # Sequence line: append it (newline removed) to the current record.
        # Lines that appear before any header belong to no record and are
        # ignored instead of failing on an undefined Seq_id.
        Seq_d[Seq_id] = Seq_d.get(Seq_id, '') + line.replace('\n', '')
# 4. codon to amino
# Codon table keyed by DNA codons (T rather than U). Values are one-letter
# amino acid codes; the three stop codons (TAA, TAG, TGA) map to ''.
# Rows are grouped by the first two bases, in T, C, A, G order.
amino = {
    # T--
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
    'TAT': 'Y', 'TAC': 'Y', 'TAA': '', 'TAG': '',
    'TGT': 'C', 'TGC': 'C', 'TGA': '', 'TGG': 'W',
    # C--
    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    # A--
    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    # G--
    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
}
# 5. RNA splicing
def RNA_splicing(Seq_d):
    """Remove every intron from the main sequence and return what is left.

    Args:
        Seq_d: Mapping of record id to sequence. The first value in iteration
            order is the sequence to splice; every later value is treated as
            an intron.

    Returns:
        The first sequence with all occurrences of each intron deleted. The
        introns are removed one after another, in iteration order.

    Raises:
        ValueError: If Seq_d is empty, so there is no sequence to splice.
    """
    sequences = iter(Seq_d.values())
    try:
        RNA = next(sequences)
    except StopIteration:
        # Previously this case failed with an UnboundLocalError at `return`.
        raise ValueError('Seq_d must contain at least one sequence') from None
    for intron in sequences:
        RNA = RNA.replace(intron, '')
    return RNA
# 6. Exon rna to protein
def protein_print(RNA, amino):
    """Translate a coding sequence and print the protein on a single line.

    Args:
        RNA: Spliced coding sequence. It is read as consecutive,
            non-overlapping 3-letter codons starting at index 0, and must use
            the same alphabet as the keys of ``amino``.
        amino: Mapping from codon to one-letter amino acid code. A codon
            whose value is empty, or that is missing from the mapping, ends
            translation.

    Returns:
        None. The protein string is written to stdout followed by a newline.
    """
    residues = []
    for i in range(0, len(RNA), 3):
        am = amino.get(RNA[i:i + 3], '')
        if not am:
            # Stop codon, unknown codon, or a trailing partial codon.
            break
        residues.append(am)
    # Print only the protein itself: appending a "Stop" marker to the same
    # line would make the output differ from the expected answer string.
    print(''.join(residues))
# 7. Run
# Splice the parsed records (first record minus the remaining ones), then
# translate the spliced sequence with the codon table and print the protein.
RNA = RNA_splicing(Seq_d)
protein_print(RNA, amino)
다른 솔루션 by Laperche Sylvain
#!/usr/bin/env python3
from re import sub
from RNA import rna_transcription
from PROT import translate
def splicing(dna, introns):
    """Return the result of translating ``dna`` with its introns removed.

    Each intron becomes one parenthesised alternative of a single regular
    expression. Every match of that expression is deleted from ``dna``; the
    remainder is passed through ``rna_transcription`` and then ``translate``.
    """
    alternatives = ('(' + intron + ')' for intron in introns)
    exons = sub('|'.join(alternatives), '', dna)
    rna = rna_transcription(exons)
    return translate(rna)
if __name__ == '__main__':
    # The first line of the dataset is the DNA string; each remaining line is
    # one intron. Trailing whitespace (including the newline) is stripped.
    with open('SPLC.dataset', encoding='utf-8') as dataset:
        dna = next(dataset, '').rstrip()
        introns = [line.rstrip() for line in dataset]
    print(splicing(dna, introns))
반응형