Link to home
Start Free TrialLog in
Avatar of Develprog
Develprog

asked on

Python code optimization for big list through built-in functions or for loop ?

Hi,

How manage with big size of list, my program look in range of 2 list entrance and leave
to display it on a time base plot. The contain of the lists are in millisecond.

I think the problem is in loop for i in X range in function Y and Ynd that take more cpu processing, How to optimize , is there built-ins functions that work better with huge range or other ?

so here is my code:
import pprint
 
import xml.dom.minidom
from xml.dom.minidom import Node

import sys

from matplotlib import pyplot
import time


def inresumegt(i):
	if i in Xresume_gt:
	  #print "OK, RANGE VALUE ",i," MATCHED TO GT RANGE"
	  return True
	else:
	  #print "NOK, RANGE VALUE ",i," NOT MATCHED TO GT RANGE"
	  return False

def inresumecnd(i):
	if i in Xresume_nd:
	  #print "OK, RANGE VALUE ",i," MATCHED TO GT RANGE"
	  return True
	else:
	  #print "NOK, RANGE VALUE ",i," NOT MATCHED TO GT RANGE"
	  return False
 

listent=[150,350,540,1000,2000,3000,4000]
listleav=[300,420,800,1900,2600,3500,5000]
liststart=[100,350,1500,2000,3500,4100]
listend=[170,400,1800,2400,300,4900]

X = range(0,5002) # listleav+2

elapsed = time.time()	
cpu = time.clock()
Xresume_gt=[]
for (gtms_in, gtms_out) in zip(listent, listleav):
    Xresume_gt.extend(range(gtms_in, gtms_out+1))
print "Duration for Xresume_gt\n",time.time() - elapsed
print time.clock() - cpu

elapsed = time.time()	
cpu = time.clock()
Xresume_nd=[]
for (gtms_in, gtms_out) in zip(liststart, listend):
    Xresume_nd.extend(range(gtms_in, gtms_out+1))
print "Duration for Xresume_nd\n",time.time() - elapsed
print time.clock() - cpu

	#1 if ingt(i) else 0 for i in X
elapsed = time.time()	
cpu = time.clock()
Y = [ 1 if inresumegt(i) else 0 for i in X ]
print "Duration for Y\n",time.time() - elapsed
print time.clock() - cpu

elapsed = time.time()	
cpu = time.clock()
Ynd = [  1 if inresumecnd(i) else 0 for i in X ]
print "Duration for Ynd\n",time.time() - elapsed
print time.clock() - cpu


pyplot.ylim(0,2) 
pyplot.plot( X, Y, '-', X, Ynd, 'r' )
pyplot.show()

Open in new window


my output is :

Duration for Xresume_gt
0.0
0.000732961343475
Duration for Xresume_nd
0.0
0.000517897755999
Duration for Y
0.325000047684
0.324799911306
Duration for Ynd
0.194000005722
0.194040736842

The problem is if my lists has more big values , for what concern the last value of 'listleav' if it is big (range X will increase too)
(e.g.  5506800) see below:
 enterent="579800" leave="584400"
 enterent="605600" leave="608600"
 enterent="3010800" leave="3125400"
 enterent="3010800" leave="3047600"
 enterent="3060600" leave="3125400"
 enterent="3544000" leave="3577400"
 enterent="3544400" leave="3569600"
 enterent="3633400" leave="3634200"
 enterent="5412600" leave="5422800"
 enterent="5482600" leave="5506800"


Thanks
ASKER CERTIFIED SOLUTION
Avatar of gelonida
gelonida
Flag of France image

Link to home
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
Start Free Trial
Just another thought.
Sometimes it's not obvious to know where the time is spent
end even experienced programmers do often micro-optimize
the wrong places of their code.

Often python profiling can help you.

Example:
#!/usr/bin/env python
import pprint
 
import xml.dom.minidom
from xml.dom.minidom import Node

import sys

from matplotlib import pyplot
import time


def inresumegt(i):
	if i in Xresume_gt:
	  #print "OK, RANGE VALUE ",i," MATCHED TO GT RANGE"
	  return True
	else:
	  #print "NOK, RANGE VALUE ",i," NOT MATCHED TO GT RANGE"
	  return False

def inresumecnd(i):
	if i in Xresume_nd:
	  #print "OK, RANGE VALUE ",i," MATCHED TO GT RANGE"
	  return True
	else:
	  #print "NOK, RANGE VALUE ",i," NOT MATCHED TO GT RANGE"
	  return False
 

def main():
    global Xresume_gt
    global Xresume_nd
    
    listent=[150,350,540,1000,2000,3000,4000]
    listleav=[300,420,800,1900,2600,3500,5000]
    liststart=[100,350,1500,2000,3500,4100]
    listend=[170,400,1800,2400,300,4900]
    
    X = range(0,5002) # listleav+2
    
    elapsed = time.time()	
    cpu = time.clock()
    Xresume_gt=[]
    for (gtms_in, gtms_out) in zip(listent, listleav):
        Xresume_gt.extend(range(gtms_in, gtms_out+1))
    print "Duration for Xresume_gt\n",time.time() - elapsed
    print time.clock() - cpu
    
    elapsed = time.time()	
    cpu = time.clock()
    Xresume_nd=[]
    for (gtms_in, gtms_out) in zip(liststart, listend):
        Xresume_nd.extend(range(gtms_in, gtms_out+1))
    print "Duration for Xresume_nd\n",time.time() - elapsed
    print time.clock() - cpu
    
    	#1 if ingt(i) else 0 for i in X
    elapsed = time.time()	
    cpu = time.clock()
    Y = [ 1 if inresumegt(i) else 0 for i in X ]
    print "Duration for Y\n",time.time() - elapsed
    print time.clock() - cpu
    
    elapsed = time.time()	
    cpu = time.clock()
    Ynd = [  1 if inresumecnd(i) else 0 for i in X ]
    print "Duration for Ynd\n",time.time() - elapsed
    print time.clock() - cpu


import cProfile
cProfile.run('main()', 'mystats.prof')
main()

import pstats
p = pstats.Stats('mystats.prof')
p.strip_dirs().sort_stats(-1).print_stats()

pyplot.show()

Open in new window


The relevant output would be something like:
Sat May 10 20:42:05 2014    mystats.prof

         10052 function calls in 0.228 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.228    0.228 <string>:1(<module>)
     5002    0.140    0.000    0.140    0.000 bla.py:13(inresumegt)
     5002    0.084    0.000    0.084    0.000 bla.py:21(inresumecnd)
        1    0.003    0.003    0.227    0.227 bla.py:30(main)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
       13    0.000    0.000    0.000    0.000 {method 'extend' of 'list' objects}
       14    0.000    0.000    0.000    0.000 {range}
        8    0.000    0.000    0.000    0.000 {time.clock}
        8    0.000    0.000    0.000    0.000 {time.time}
        2    0.000    0.000    0.000    0.000 {zip}

Open in new window


and gives you a clear indication which lines consume time.
Avatar of Develprog
Develprog

ASKER

Yes,

By adding set :
 Xresume_gt_set = set(Xresume_gt)) 

Open in new window


before calling  inresumegt and inresumecnd,..
def inresumegt(i):
	if i in Xresume_gt_set:

Open in new window


It gives a super optimization Gelonida thanks!
Very usefull  the python cProfiling module.

What if Xresume_gt and Xresume_gt_set are attributes of a class (e.g:  self.Xresume_gt , self.Xresume_gt_set,...) aren't local variables.

Gives the set function still good optimization, if not what to do?
yes even with class attributes the set function should give good results as it avoids traversing the list for each search.

Y = [ 1 if i in self.Xresume_gt_set else 0 for i in X ]

Open in new window


Some micro-optimisation (I did not check, whether it has any measurable impact, but you can profile yourself ;-)  ), that you could do could be to reduce one attribute lookup in the list expression.

gt_set = self.Xresume_gt_set
Y = [ 1 if i in gt_set else 0 for i in X ]

Open in new window


P.S. I just timed the suggested micro optimisation.
It might give you about 30% speedup in a tight inner loop, at least with my python version

In [9]: class C(object):
    pass
   ...: 

In [10]: c.l  = l = set(range(200))

In [11]: timeit [ (v in l) for v in range(300) ]
10000 loops, best of 3: 22.6 µs per loop

In [12]: timeit [ (v in c.l) for v in range(300) ]
10000 loops, best of 3: 30.1 µs per loop

Open in new window

Ok, so it stays still good, nice.

One step further is to make a list of list
Here I just remark that 'set' cannot manage it, or is there a way to pass it into a set?  So here the piece of code that
gives this error:

    Xresume_gt_listof_set = set(Xresume_gt_listof)
TypeError: unhashable type: 'list'

import pprint
 
import xml.dom.minidom
from xml.dom.minidom import Node

import sys

from matplotlib import pyplot
import time


def inresumegt(i):
	if i in Xresume_gt_set:
	  return True
	else:
	  return False
def inresumegtlistof(i):
	if i in Xresume_gt_listof_set:
	  return True
	else:
	  return False
	  
def inresumecnd(i):
	if i in Xresume_nd_set:
	  #print "OK, RANGE VALUE ",i," MATCHED TO GT RANGE"
	  return True
	else:
	  #print "NOK, RANGE VALUE ",i," NOT MATCHED TO GT RANGE"
	  return False
 
#GT
listent=[150,350,540,1000,2000,3000,4000]
listleav=[300,420,800,1900,2600,3500,5000]
#ND
liststart=[100,350,1500,2000,3500,4100]
listend=[170,400,1800,2400,300,4900]

X = range(0,5002)

#1.GT
elapsed = time.time()	
cpu = time.clock()
'''
Xresume_gt=[]
for (gtms_in, gtms_out) in zip(listent, listleav):
    Xresume_gt.extend(range(gtms_in, gtms_out+1))
'''
size_list_gt = len(listent)
Xresume_gt_listof=[]
Xresume_gt_listof = [[0 for x in xrange(size_list_gt)] for x in xrange(size_list_gt)]
#self.Xresume_gt_each = [[0 for x in xrange(size_list_gt)] for x in xrange(size_list_gt)]
#self.Xresume_gt_eacht

if size_list_gt != 0:
    #here adding list of list because of overlaping in the GT events 
    for ind, (gtms_in, gtms_out) in enumerate(zip(listent,listleav)):
      print "index is ",ind,""
      for i in range(gtms_in, gtms_out):
         Xresume_gt_listof[ind].append(i)
else:
    print "not yet filled time lists of GT"

print "Duration for Xresume_gt\n",time.time() - elapsed
print time.clock() - cpu
Xresume_gt_listof_set = set(Xresume_gt_listof)

#2.ND
elapsed = time.time()	
cpu = time.clock()
Xresume_nd=[]
for (gtms_in, gtms_out) in zip(liststart, listend):
    Xresume_nd.extend(range(gtms_in, gtms_out+1))
print "Duration for Xresume_nd\n",time.time() - elapsed
print time.clock() - cpu
Xresume_nd_set = set(Xresume_nd)

###########################################################
#1.GT MULTI PLOT
elapsed = time.time()	
cpu = time.clock()
'''
Y = [ 1 if inresumegt(i) else 0 for i in X ]
'''
Ylist=[]
for i in xrange(0,size_list_gt-1): 
   Ylist[i] = [ 1 if inresumegtlistof(i) else 0 for i in X ]
print "Duration for Y\n",time.time() - elapsed
print time.clock() - cpu

#2.ND SIMPLE PLOT
elapsed = time.time()	
cpu = time.clock()
Ynd = [  1 if inresumecnd(i) else 0 for i in X ]
print "Duration for Ynd\n",time.time() - elapsed
print time.clock() - cpu

pyplot.ylim(0,2) 
pyplot.plot( X, Y, '-', X, Ynd, 'r' )
pyplot.show()

Open in new window


Thanks
You have a list of lists and want to create a set of all members,  so that you can quickly check membership?

Please see below example.
In [1]: import itertools
In [2]: ll = [ [ vv for vv in xrange(4*v,4*v+3) ] for v in xrange(3) ]
In [3]: s = set(v for v in itertools.chain.from_iterable(ll))
In [4]: ll
Out[4]: [[0, 1, 2], [4, 5, 6], [8, 9, 10]]
In [5]: s
Out[5]: {0, 1, 2, 4, 5, 6, 8, 9, 10}

Open in new window

Thank you Gelonida,

I'll tray it
Hi Gelonida,
Thank you it works more fast.
But unfortunately with 'set' all datas aren't present.
So if you imagine to display severals range of datas which are overlaped then we can have issue
because of some missing informations (e.g. : if we want to display each of these ranges into a plot graph).

Examples:

def inresumegtlistofset(i):  # +/- 30 times fast but does not display all informations (e.g: when datas are overlape,..) 
    if i in Xresume_gt_listof_set:
      return True
    else:
      return False
      
def inresumegtlistoflist(i,ind): # versy slow if wide range data, but display all informartions
    if i in Xresume_gt_listof[ind]:
      return True
    else:
      return False

	  

listent=[150,350,350,1000,2000,2500,4000]
listleav=[300,420,800,1900,2600,3500,5000]


index_size = len(listleav)
last_val = listleav[index_size-1]
X = xrange(0,last_val+2)

size_list_gt = len(listent)
raw_input()
#Xresume_gt_listof=[]
Xresume_gt_listof = [[0 for x in xrange(size_list_gt)] for x in xrange(size_list_gt)] 
Xresume_gt_listof = [[x for x in xrange(listent[t],listleav[t])] for t in (xrange(size_list_gt))]

Xresume_gt_listof_set = set((v for v in itertools.chain.from_iterable(Xresume_gt_listof)))

#1.GT MULTI PLOT
elapsed = time.time()	
cpu = time.clock()

Ylist=[]
Ylist = [[0 for x in xrange(size_list_gt)] for x in xrange(size_list_gt)]
for ind in xrange(0,size_list_gt): 
  #Ylist[ind] = [ 1 if inresumegtlistofset(i) else 0 for i in X ] # with set fast
   Ylist[ind] = [ 1 if inresumegtlistoflist(i,ind) else 0 for i in X ] # with list slow but index can help
print "Duration for Y\n",time.time() - elapsed
print time.clock() - cpu

f1= pyplot.figure("wind")
pyplot.figure("wind")
pyplot.ylim(0,2) 

leng = len(Ylist)
for i in range(0,leng):
   pyplot.plot( X, Ylist[i], '-')
pyplot.show()

Open in new window


Is there a way to fix this display plotting issue , or otherwhise is there another fast way to fix it?

 Thank you
You need double book keeping.
The list for plotting and the set for searching.
So store each value in a list and in a set
I don't get what you mean exactly.
could you explain more pls.
Well,  I do have a lot of pressure at work and not that much time.

will try to look at this later in more detail

But obviously the two functions cannot have identical behaviour.

one function has 1 parameter, the other one two.

one checks for membership in one sub list of a list of lists
the other one searches in the entire set

you have to have a list of sets and check for membership just in the sub set
Hi I'm still quite busy and have onl very limited time

Could you please post the most recent slow and working version and the most recent fast but non working version.

If I have both versions I'm rather sure, that I can point out how to fix it.
Hi gelonida,
i will put codes. Thank you
So here the code as it is +/-20 times fast but does not display all data of Ylist on the plot.
If you comment line 229 and use line 228 instead then it is slow but all data of Ylist are displayed on plot.

#tool to test matplotlib installation
import pprint
 
import xml.dom.minidom
from xml.dom.minidom import Node

import sys

from matplotlib import pyplot
import time

import itertools

def Comparetime(enter,leave):
    diff = leave-enter
    #insec = diff/1000
    return diff
	
def ingt(i):
   for ind, (gtms_in, gtms_out) in enumerate(zip(listent, listleav)):
        if gtms_in <= i <= gtms_out :
           #print "OK, RANGE VALUE ",i," MATCHED TO GT RANGE"
           return True
        else:
           #print "NOK, RANGE VALUE ",i," NOT MATCHED TO GT RANGE"
           #return False
           continue
   return False 

def incnd(i):
   for ind, (gtms_in, gtms_out) in enumerate(zip(liststart, listend)):
        if gtms_in <= i <= gtms_out :
           #print "OK, RANGE VALUE ",i," MATCHED TO GT RANGE"
           return True
        else:
           #print "NOK, RANGE VALUE ",i," NOT MATCHED TO GT RANGE"
           #return False
           continue
   return False 
def Func_disp_gt_allevents():
    size = len(listent)
    print "display gt all events:"
    for (gtms_ent, gtms_outt) in zip(listent, listleav):
        print gtms_ent," -> ",gtms_outt,""
    print "display end!"
    return
def Func_disp_alist_size(thelist):
    size_row= len(thelist)
    print "the row length is",size_row,""
    for i in xrange (size_row):
      size_col = len(thelist[i])
      print "row:",i," the length of col is:",size_col,""
      #for y in xrange (size_col):
         #print thelist[i][y]
    return

def Funcanalyzegt():
    raw_input()  
    for node in Docgt.getElementsByTagName("event"):
       if node.getAttribute("class") == "human":
           entertime = int(node.getAttribute("enterTimeMS"))
           listent.append(entertime)
           print entertime
           
           leavetime = int(node.getAttribute("leaveTimeMS"))
           listleav.append(leavetime)
           print leavetime
           diff = Comparetime (entertime,leavetime)
           listtime.append(diff)
           #ComputeMaxmin(diff)
              
           #loc = get_first_node_val(node, "property")
           #listzone.append(int(loc))
           #print loc.toxml()
           nList = node.getElementsByTagName("properties")
           for nodes in nList:
              print 'current nodes: ',nodes
              eList = nodes.getElementsByTagName("property")
              for e in eList:
                 print 'current sub nodes: ',e
                 if e.hasAttribute("value"):
                    val = int(e.getAttribute("value"))
                    #print e.toxml()
                    print val
                    #raw_input()
                    listzone.append(val)
       else:
           print"event not human class type" 

def Funcanalyzecand():
    if cand_file_input != None:
       for node in Doccand.getElementsByTagName("alarm"):
          starttime = int(node.getAttribute("startTimeMS"))
          liststart.append(starttime)
          print starttime

          endtime = int(node.getAttribute("endTimeMS"))
          listend.append(endtime)
          print endtime
          diff = Comparetime (starttime,endtime)
          listcandtime.append(diff)

       candeventsnb = len(listcandtime)
    else:
       print "canditate file not loaded, cannot process all analyze"

def inresumecnd(i):
    if i in Xresume_nd_set:
      #print "OK, RANGE VALUE ",i," MATCHED TO GT RANGE"
      return True
    else:
      #print "NOK, RANGE VALUE ",i," NOT MATCHED TO GT RANGE"
      return False
   
 

def inresumegt(i):
    if i in Xresume_gt_set:
      return True
    else:
      return False
def inresumegtlistofset(i): # +/- 30 times fast but does not display all informations (e.g: when datas are overlape,..)
    if i in Xresume_gt_listof_set:
      return True
    else:
      return False
      
def inresumegtlistoflist(i,ind): # versy slow if wide range data, but display all informartions
    if i in Xresume_gt_listof[ind]:
      return True
    else:
      return False
	  

#listent=[150,350,540,1000,2000,3000,4000]
#listleav=[300,420,800,1900,2600,3500,5000]
listent=[150,350,350,1000,2000,2500,4000]
listleav=[300,420,800,1900,2600,3500,5000]
liststart=[100,350,1500,2000,3500,4100]
listend=[170,400,1800,2400,300,4900]
#X = range(0,5002)

'''
listent=[]
listleav=[]
liststart=[]
listend=[]
listtime = []
listcandtime =[]

gt_file_input=sys.argv[1]
cand_file_input=sys.argv[2]

Docgt = xml.dom.minidom.parse(gt_file_input)
Doccand =xml.dom.minidom.parse(cand_file_input)

print "Now call gt"
Funcanalyzegt()
print "Now call cand"
Funcanalyzecand()
'''


index_size = len(listleav)
last_val = listleav[index_size-1]
X = xrange(0,last_val+2)
#X = range(0,5506800+2)

#1.GT
elapsed = time.time()	
cpu = time.clock()
'''
Xresume_gt=[]
for (gtms_in, gtms_out) in zip(listent, listleav):
    Xresume_gt.extend(range(gtms_in, gtms_out+1))
'''
size_list_gt = len(listent)
#Func_disp_gt_allevents()
raw_input()
#Xresume_gt_listof=[]
Xresume_gt_listof = [[0 for x in xrange(size_list_gt)] for x in xrange(size_list_gt)] 
Xresume_gt_listof = [[x for x in xrange(listent[t],listleav[t])] for t in (xrange(size_list_gt))]
Func_disp_alist_size(Xresume_gt_listof)
raw_input()


#self.Xresume_gt_each = [[0 for x in xrange(size_list_gt)] for x in xrange(size_list_gt)]
#self.Xresume_gt_eacht

print "Duration for Xresume_gt\n",time.time() - elapsed
print time.clock() - cpu

#FILL:
#Xresume_gt_listof_set = set(Xresume_gt_listof)
'''
for i in xrange(0,size_list_gt-1):
   Xresume_gt_listof_set = set(Xresume_gt_listof[i])
'''
Xresume_gt_listof_set = set((v for v in itertools.chain.from_iterable(Xresume_gt_listof)))
'''
for item in Xresume_gt_listof_set:
   print item
raw_input()
'''

raw_input()

#2.ND
elapsed = time.time()	
cpu = time.clock()
Xresume_nd=[]
for (gtms_in, gtms_out) in zip(liststart, listend):
    Xresume_nd.extend(range(gtms_in, gtms_out+1))
print "Duration for Xresume_nd\n",time.time() - elapsed
print time.clock() - cpu
Xresume_nd_set = set(Xresume_nd)

###########################################################
#1.GT MULTI PLOT
elapsed = time.time()	
cpu = time.clock()
'''
Y = [ 1 if inresumegt(i) else 0 for i in X ]
'''
Ylist=[]
Ylist = [[0 for x in xrange(size_list_gt)] for x in xrange(size_list_gt)]
for ind in xrange(0,size_list_gt): 
  Ylist[ind] = [ 1 if inresumegtlistofset(i) else 0 for i in X ] # with set fast
  #Ylist[ind] = [ 1 if inresumegtlistoflist(i,ind) else 0 for i in X ] # with list slow but index can help
print "Duration for Y\n",time.time() - elapsed
print time.clock() - cpu

#2.ND SIMPLE PLOT
elapsed = time.time()	
cpu = time.clock()
Ynd = [  0.8 if inresumecnd(i) else 0 for i in X ]
print "Duration for Ynd\n",time.time() - elapsed
print time.clock() - cpu

f1= pyplot.figure("mywind")
pyplot.figure("mywind")
pyplot.ylim(0,2) 
#pyplot.plot( X, Y, '-', X, Ynd, 'r' )
leng = len(Ylist)
for i in range(0,leng):
   pyplot.plot( X, Ylist[i], '-')
pyplot.plot( X, Ynd, 'r' )
pyplot.show()

Open in new window


How use set or list to optimize and display all datas of Ylist on plot.

Thanks
change  line 199
from
Xresume_gt_listof_set = set((v for v in itertools.chain.from_iterable(Xresume_gt_listof)))

Open in new window

to
Xresume_gt_listof_set = [ set(sublist) for sublist in Xresume_gt_listof ]

Open in new window


and line 122 - 126
from
def inresumegtlistofset(i): # +/- 30 times fast but does not display all informations (e.g: when datas are overlape,..)
    if i in Xresume_gt_listof_set:
      return True
    else:
      return False

Open in new window


to
def inresumegtlistofset(i, ind): # +/- 30 times fast and should display all informations (e.g: when datas are overlape,..)
    if i in Xresume_gt_listof_set[ind]:
      return True
    else:
      return False

Open in new window


By the way:
The value in iterable statement returns already True / False, so you can rewrite the functionto
def inresumegtlistofset(i, ind): # +/- 30 times fast and should display all informations (e.g: when datas are overlape,..)
    return i in Xresume_gt_listof_set[ind]:

Open in new window

or just remove the function and directly replace
inresumegtlistoflist(i,ind)

Open in new window

with
i in Xresume_gt_listof_set[ind]

Open in new window

in line 228

so that it looks like:
Ylist[ind] = [ 1 if i in Xresume_gt_listof_set[ind] else 0 for i in X ] 

Open in new window

Thank you Gelonida,

It is fast and complete now.