'''
This takes a bedfile and a snparray data file and outputs the
ratio between the median intensity for the bed regions divided
by the median intensity for genome background (i.e., non bed regions)
'''
chrom = ['chr'+str(i) for i in range(1,23)]+['chrX']
bed_positions = {chromosome:0 for chromosome in chrom}
filename= 'lads_galaxy_hg19_mapped.bed'
bedfile = open(filename, 'r')
for line in bedfile:
temp = line.split('\t')
start = int(temp[1])
end = int(temp[2])
chromosome = temp[0]
assert chromosome in bed_positions
bed_positions[chromosome] += end - start + 1
bedfile.close() # do not forget to close the file explicitly or use the with construct
sum = 0
for chromosome in bed_positions:
n = bed_positions[chromosome]
print chromosome, n
sum += n
print 'sum', sum
c:\tmp\___python\dfernan\Q_27668375>python a.py
chrX 78751464
chr13 35025843
chr12 46518464
chr11 55813063
chr10 51281011
chr17 16289483
chr16 32767186
chr15 24089858
chr14 38560293
chr19 7700805
chr18 35824641
chr22 5908395
chr20 23604355
chr21 9100296
chr7 65906044
chr6 67765901
chr5 79127419
chr4 99751548
chr3 73996343
chr2 95936291
chr1 77870888
chr9 43333020
chr8 65489675
sum 1130412286
'''
This takes a bedfile and a snparray data file and outputs the
ratio between the median intensity for the bed regions divided
by the median intensity for genome background (i.e., non bed regions)
'''
def loadBedPositions(fname):
chrom = ['chr'+str(i) for i in range(1,23)]+['chrX']
bed_positions = {chromosome: [] for chromosome in chrom}
bedfile = open(fname, 'r')
for line in bedfile:
temp = line.split('\t')
start = int(temp[1])
end = int(temp[2])
chromosome = temp[0]
assert chromosome in bed_positions
bed_positions[chromosome].append( (start, end) )
bedfile.close() # do not forget to close the file explicitly or use the with construct
return bed_positions
def medianValue(lst):
def myCmp(a, b):
return cmp(a[0], b[0])
# The length of the sequence is the sum of lengths of subsequences (intervals).
seqlen = 0
for start, end in lst:
seqlen += end - start + 1
# Index of the median element.
mi = seqlen // 2
# Skip the subsequences that do not cross the median point, and then get
# the median out of the next subsequence
n = 0 # init -- number of elements in previous sequences
for start, end in sorted(lst, myCmp):
x = end - start + 1 # length of the subsequence
if n + x > mi: # subsequence contains the middle element
di = mi - n # index of the middle elem inside the subsequence
return start + di - 1 # value of the median
else:
n += x
if __name__ == '__main__':
bed_positions = loadBedPositions('lads_galaxy_hg19_mapped.bed')
for chromosome in bed_positions:
print chromosome, medianValue(bed_positions[chromosome])
c:\tmp\___python\dfernan\Q_27668375>python b.py
chrX 94388829
chr13 70206810
chr12 66997615
chr11 83347289
chr10 83912730
chr17 50310741
chr16 52344135
chr15 60290770
chr14 49523046
chr19 33042562
chr18 42711731
chr22 26715724
chr20 21452517
chr21 25053830
chr7 87779768
chr6 90178045
chr5 94652023
chr4 91913307
chr3 85084133
chr2 127283853
chr1 119451935
chr9 76428582
chr8 67100249
If you are experiencing a similar issue, please ask a related question
Title | # Comments | Views | Activity |
---|---|---|---|
Can't install scipy on Python 2.7 in Win7 | 2 | 220 | |
looping | 11 | 37 | |
python pyusb how to find a file on mass storage | 3 | 86 | |
Python Assistance | 7 | 81 |
Join the community of 500,000 technology professionals and ask your questions.