Input file: GPL1440_family.soft.gz (placed under ./GPLRaw/)
Output files: parsing_GSM_relation.txt, parsing_GSM_err.txt, parsing_GSM_results.txt
## Extract per-sample (GSM) attributes from GEO platform family SOFT files.
##
## For every ./GPLRaw/*.soft.gz file this script writes
##   parsing_GSM_results.txt   sample <TAB> attribute <TAB> value
##   parsing_GSM_relation.txt  GPL <TAB> GSE <TAB> GSM
##   parsing_GSM_err.txt       offending line <TAB> error message
## into the current working directory.
import glob
import gzip
import os

## Reading GPL Files
GPL_PATTERN = './GPLRaw/*.soft.gz'
GPL_SUFFIX = '_family.soft.gz'

## Recording parsing results
RESULT_PATH = 'parsing_GSM_results.txt'
## Recording Relation GPL, GSE, GSM
RELATION_PATH = 'parsing_GSM_relation.txt'
## Recording parsing errors
ERR_PATH = 'parsing_GSM_err.txt'


def splicing_line(line):
    """Split a ``!key = value`` line into ``[key, value]``.

    Only the first ``=`` separates the key from the value, so a value that
    itself contains ``=`` is returned whole. Leading ``!`` markers are
    removed from the key and both parts are stripped of surrounding
    whitespace.

    Raises:
        IndexError: if ``line`` contains no ``=`` at all.
    """
    parts = line.split('=', 1)
    key = parts[0].lstrip('!').strip()
    return [key, parts[1].strip()]


def _gpl_name(path):
    """Return the platform name encoded in a ``<GPL>_family.soft.gz`` path."""
    name = os.path.basename(path)
    # str.rstrip() would strip a *set of characters*, not this suffix.
    if name.endswith(GPL_SUFFIX):
        name = name[:-len(GPL_SUFFIX)]
    return name


def _parse_samples(lines, gpl_name, files_result, files_relation, files_err):
    """Scan SOFT text lines and record the attributes of every sample.

    ``lines`` is any iterable of text lines; the three ``files_*`` arguments
    are writable text streams. A line that cannot be parsed is reported on
    stdout and in ``files_err``; scanning then continues with the next line.
    """
    sample_name = ''
    in_sample_header = False
    for line in lines:
        try:
            if line.startswith('^SAMPLE'):
                in_sample_header = True
                sample_name = line.split('=', 1)[1].strip()
            # Attribute lines stop where the sample's data table starts.
            if line.startswith(('!sample_table_begin', '!sample_table_end')):
                in_sample_header = False
            if not in_sample_header:
                continue
            if line.startswith('!'):
                key, value = splicing_line(line)
                files_result.write(
                    sample_name + '\t' + key + '\t' + value + '\n')
                if line.startswith('!Sample_series_id'):
                    files_relation.write(
                        gpl_name + '\t' + value + '\t' + sample_name + '\n')
        except Exception as e:
            # Best effort: log the bad line and keep going.
            print('Error %s %s' % (line.strip(), e))
            files_err.write(line.strip() + '\t' + '%s' % e + '\n')


def main():
    """Parse every file matching ``GPL_PATTERN`` into the three output files."""
    # Context managers close every handle even if a file fails to decompress.
    with open(RESULT_PATH, 'w', encoding='utf-8') as files_result, \
            open(RELATION_PATH, 'w', encoding='utf-8') as files_relation, \
            open(ERR_PATH, 'w', encoding='utf-8') as files_err:
        for afile in glob.glob(GPL_PATTERN):
            print(afile)
            # gzip.open() defaults to binary mode; text mode is required so
            # the str prefix checks work. Undecodable bytes are replaced
            # rather than aborting the whole run.
            with gzip.open(afile, 'rt', encoding='utf-8',
                           errors='replace') as file_gz:
                _parse_samples(file_gz, _gpl_name(afile),
                               files_result, files_relation, files_err)


if __name__ == '__main__':
    main()
'Informatics > Genome Informatics' 카테고리의 다른 글
Genome / dbsnp에서 rsID 추적하기(통합된 것 찾기) (4) | 2011.09.23 |
---|---|
Genome / Genetic Risk (0) | 2011.08.30 |
Genome / 뉴클레오티드, 뉴클레오티드키나아제 (0) | 2011.06.22 |
Genome / SNP Genotype 표 이해하기 (0) | 2011.06.16 |
FASTQ Format (0) | 2011.05.30 |