Array itteration speed up + other tips needed

moogie · February 18, 2004, 9:25pm

I am wondering what programmers can do to speed up their programs.

according to a profiling tool, more than 45% of the CPU time is spent in one method:


            // encodes a given pixel at point (w,h)
            final public void encode(int w,int h,int k,int width,byte[][] frame, boolean verbose,boolean[][] ODCodes,boolean [][] IDCodes,boolean[][] huffCodes, BitOutputStream controlW,BitOutputStream IDW,BitOutputStream ODW,BitOutputStream CCW) throws IOException
            {
                  j=h*width+w;
                  currentPixelRed=frame[j][0]; 
                  currentPixelGreen=frame[j][1]; 
                  currentPixelBlue=frame[j][2]; 
                  

                  if (k>-1)
                  {
                        comparatorPixelRed=frame[k][0]; 
                        comparatorPixelGreen=frame[k][1]; 
                        comparatorPixelBlue=frame[k][2]; 
                        Rdiff=256+((currentPixelRed & 0xFF)-(comparatorPixelRed & 0xFF));
                        Gdiff=256+((currentPixelGreen & 0xFF)-(comparatorPixelGreen & 0xFF));
                        Bdiff=256+((currentPixelBlue & 0xFF)-(comparatorPixelBlue & 0xFF));
                  }
                  

                  innerBitCount=0;
                  // Determine the number of bits compressing via the InnerDifferences or saving Colour Component will give.
                        
                  innerBitCount+=huffCodes[((int)(currentPixelRed) & 0xff)].length+1;
                        
                  RGdiff= 256+((currentPixelGreen & 0xFF)-(currentPixelRed & 0xFF));
                  if (IDCodes[RGdiff].length>0 && IDCodes[RGdiff].length<huffCodes[(currentPixelGreen & 0xFF)].length)
                  {
                        innerBitCount+=IDCodes[RGdiff].length+1;
                  }
                  else
                  {
                        innerBitCount+=huffCodes[((int)(currentPixelGreen) & 0xff)].length+1;
                  }
                              
                  RBdiff= 256+((currentPixelBlue & 0xFF)-(currentPixelRed & 0xFF));
                  if (IDCodes[RBdiff].length>0 && IDCodes[RBdiff].length<huffCodes[(currentPixelBlue & 0xFF)].length)
                  {
                        innerBitCount+=IDCodes[RGdiff].length+1;
                  }
                  else
                  {
                        innerBitCount+=huffCodes[((int)(currentPixelBlue) & 0xff)].length+1;
                  }
                  
                  // if the pixel we are comparing the current pixel to is in range and all the RGB components of the current pixel are in
                  // the Outer Differences and coding using the OuterDifferences yields better compression than via the InnerDifferences write.write the OD codes.
                  if (k>-1 && ODCodes[Rdiff].length>0 && ODCodes[Gdiff].length>0 && ODCodes[Bdiff].length>0 && (innerBitCount>ODCodes[Rdiff].length+ODCodes[Gdiff].length+ODCodes[Bdiff].length+1))
                  {
                              
                        controlW.value=controlW.value<<1 |  bitMask; controlW.count++;
                        controlW.addToBuffer();

                        for (int i=0;i<ODCodes[Rdiff].length;i++)
                        {
                              ODW.value=ODW.value<<1; ODW.count++;
                              if (ODCodes[Rdiff][i]) ODW.value=ODW.value| bitMask;
                              ODW.addToBuffer();
                        }
                        
                        for (int i=0;i<ODCodes[Gdiff].length;i++)
                        {
                              ODW.value=ODW.value<<1; ODW.count++;
                              if (ODCodes[Gdiff][i]) ODW.value=ODW.value| bitMask;
                              ODW.addToBuffer();
                        }
                        
                        for (int i=0;i<ODCodes[Bdiff].length;i++)
                        {
                              ODW.value=ODW.value<<1; ODW.count++;
                              if (ODCodes[Bdiff][i]) ODW.value=ODW.value| bitMask;
                              ODW.addToBuffer();
                        }
                  }
                  else
                  { 
                  
                        // set Red component to its huffman code
                        red=((int)(currentPixelRed) & 0xff);

                        controlW.value=controlW.value<<1; 
                        controlW.count++;
                        controlW.addToBuffer();

                              
                        for (int i=0;i<huffCodes[red].length;i++)
                        {
                              CCW.value=CCW.value<<1; CCW.count++;
                              if (huffCodes[red][i]) CCW.value=CCW.value| bitMask;
                              CCW.addToBuffer();
                        }
            
                              
                        // if the Red-Green Difference is in the Inner Differences encode the component using Inner Differences
                        // unless the Green component huffman code is less than the difference code in which case the Green component huffman code is written.
                              
                        
                        RGdiff= 256+((currentPixelGreen & 0xFF)-(currentPixelRed & 0xFF));
                        if (IDCodes[RGdiff].length>0 && IDCodes[RGdiff].length<huffCodes[(currentPixelGreen & 0xFF)].length)
                        {
                              controlW.value=controlW.value<<1 |  bitMask; controlW.count++;
                              controlW.addToBuffer();
                              
                              for (int i=0;i<IDCodes[RGdiff].length;i++)
                              {
                                    IDW.value=IDW.value<<1; IDW.count++;
                                    if (IDCodes[RGdiff][i]) IDW.value=IDW.value| bitMask;
                                    IDW.addToBuffer();
                              }
                        }
                        else
                        {
                              controlW.value=controlW.value<<1; controlW.count++;
                              controlW.addToBuffer();
                              for (int i=0;i<huffCodes[(currentPixelGreen & 0xFF)].length;i++)
                              {
                                    CCW.value=CCW.value<<1; CCW.count++;
                                    if (huffCodes[(currentPixelGreen & 0xFF)][i]) CCW.value=CCW.value| bitMask;
                                    CCW.addToBuffer();
                              }
                                    
                        }
                              
                        // if the Red-Blue Difference is in the Inner Differences encode the component using Inner Differences
                        // unless the Blue component huffman code is less than the difference code in which case the Blue component huffman code is written.

                        RBdiff= 256+((currentPixelBlue & 0xFF)-(currentPixelRed & 0xFF));
                        if (IDCodes[RBdiff].length >0 && IDCodes[RBdiff].length<huffCodes[(currentPixelBlue & 0xFF)].length)
                        {
                              controlW.value=controlW.value<<1 |  bitMask; controlW.count++;
                              controlW.addToBuffer();
                              
                              for (int i=0;i<IDCodes[RBdiff].length;i++)
                              {
                                    IDW.value=IDW.value<<1; IDW.count++;
                                    if (IDCodes[RBdiff][i]) IDW.value=IDW.value| bitMask;
                                    IDW.addToBuffer();
                              }
                        }
                        else
                        {
                              controlW.value=controlW.value<<1; controlW.count++;
                              controlW.addToBuffer();
                              for (int i=0;i<huffCodes[(currentPixelBlue & 0xFF)].length;i++)
                              {
                                    CCW.value=CCW.value<<1; CCW.count++;
                                    if (huffCodes[(currentPixelBlue & 0xFF)][i]) CCW.value=CCW.value| bitMask;
                                    CCW.addToBuffer();
                              }
                        }
                        
                  }

This method is called over 300000 times, depending on the frame’s size, and is called by the following block of code:



                        ///// encode pixels /////
                        
                        // Generate Outer Differences
      
                        // encode Pixels using Scan Method 1
                        if (mode==1)
                        {
                        
                              prev=-1;
                              for (h=0;h<height;h++)
                              {
                                    for (w=0;w<width;w++)
                                    {
                                          encode.encode(w,h,h*width+w+prev,width,frame,verbose,ODCodes,IDCodes,huffCodes,controlW,IDW,ODW,CCW);
                                    }
                                    
                              }
                        }
                        // encode Pixels using Scan Method 2
                        else if (mode==2)
                        {
      
                              prev=-width;
                        
                              for (w=0;w<width;w++)
                              {
                                    for (h=0;h<height;h++)
                                    {
                                          encode.encode(w,h,h*width+w+prev,width,frame,verbose,ODCodes,IDCodes,huffCodes,controlW,IDW,ODW,CCW);
                                    }
                              }
                        }
                        // encode Pixels using Scan Method 3
                        else if (mode==3)
                        {
                        
                              prev=-width;
                              for (h=0;h<height;h++)
                              {
                                    for (w=0;w<width;w++)
                                    {
                                          encode.encode(w,h,h*width+w+prev,width,frame,verbose,ODCodes,IDCodes,huffCodes,controlW,IDW,ODW,CCW);
                                    }
                                    
                              }
                        }
                        // encode Pixels using Scan Method 4
                        else 
                        {
                        
                              for (h=0;h<height;h++)
                              {
                                    encode.encode(0,h,-1,width,frame,verbose,ODCodes,IDCodes,huffCodes,controlW,IDW,ODW,CCW);
                              }
                        
                              prev=-1;
                              for (w=1;w<width;w++)
                              {
                                    
                                    for (h=0;h<height;h++)
                                    {
                                          encode.encode(w,h,h*width+w+prev,width,frame,verbose,ODCodes,IDCodes,huffCodes,controlW,IDW,ODW,CCW);
                                    }
                                    
                              }
                        }

As you can see there are four different ‘scan modes’ which dictate how the frame is scanned.
I have read that calling a method in java is much more expensive than in C/C++ I am wondering whether there is a pattern that allows me to make four classes? that basically have the same functionalitly (with out the encode method call, i.e. inline ) but differ on how the frame is scanned? And do do this without uncessary replication of the code?

Also, i would love to hear how c/c++ programs are able to quickly itterate over an array… for example in video codecs… how are they able to itterate over the pixels of a frames 30 times a second? i can bearly perform 1 time per second.

Any help is very much appreciated!

Jeff · February 19, 2004, 1:58am

[quote]I have read that calling a method in java is much more expensive than in C/C++
[/quote]
Which hopefully will teach you not to believe everything you read.

Thats errant nonsense.

Sounds like you want tor efactor into a method with an invocation of an obejct method at the cneter of the loop. Depending on what obejct is invoked that will rsult in different code at the center. Ofcourse that will defeat inlining at the center of the loop so if this is your bottleneck it may be better to have seperate, if redundant, code paths.

Soudns more like your java code is messed up. Thatsa lot of Java code to wade through though, coudl you pare your exampel down to a minimal case?

I used to code maximally fast C code. for video drivers You dont deference arrays if you want that. You set a pointer to the start of the data and auto-increment it.

You cannot walk a pointer in Java. Luckily though Java is smarter then C. (Assuming this is Java2 and not J2ME) It will optimzize for you in ways C cannot. The biggest key to looping over arrays in Java2 is that you make the loop clear to the compiler. If the end points of a loopa re predictable then it will elminate all the bounds checking except at the end points. If they are not the you will get bounds checks on every array access and drop to about half speed.

Edit: Tooka quick look. Jeez this is a mess. It looks to me almost like you are looping outside the call but fetching the indvidual pixels seperately inside of each call. I suspect you are confusing the poor optimzier to the point where it is doing bounds cehcks for every pixel. Simplify your code. Al LEAST do your pixel access in the actual loop thats scanning the image and pass in the pixel valeus rather then doing what is effectively unpredictable array access in every call.

BTW this would SUCK in C too because C, beign worse then java abotu inlining, would likely be doing function calls for every damn pixel.

moogie · February 19, 2004, 2:45am

ah, thats good to hear… tho in the benchmarks i have seen java does lag a bit… but you believe that it is probably not the cause?

That is what i thought i might have to do… it will be annoying to maintain the code but speed is an issue.

How would one let the interpreter know that the end points of the loops are ‘predictable’?

Yes, that is what i am doing as it is the only way i know to achieve the functionality without replicated code in different paths. But i may just replicate the code so this issue should hopefully dissappear.

I am not very knowledgable about how the optimizer works so i did not know that my pixel array access inside the method would cause confusion to the optimizer. I will put it on the outside.

Thanks for the tips!

Jeff · February 19, 2004, 3:04am

My pleasure. Keep me up to date on your progress and I’ll try to help as I can and have the time. (Im incredibly busy the next 4 weeks but ill TRY to keep an eye on this topic.)

moogie · February 19, 2004, 3:29am

I will modify the code to have seperate paths of replicated code in the next few hourse and see how the performance chages.

I’ll post the results here.

If you have not gathered by the code i have posted, i am/have developed a lossless video codec. At the moment it is extreamly slow, my aim is to optimize the code such that on play back it can peform the standard 30 frames per second at 720x480. At the moment i am lucky to achieve 2 frames per second.

I am at a stage that optimization is needed before i go and try to improve the compression of my codec by useing different colour spaces and motion estimatiton.

If you are interested i can send the full source, It does have a decent amount of commenting

moogie · February 19, 2004, 5:43am

Here are the results:



Old build (as listed above)   
Sun jdk 1.4.01:  409 msec per frame
JET executable:  265 msec per frame

New build (with seperate exec paths)
Sun jdk 1.4.01:  404 msec per frame
JET executable:  273 msec per frame

The time was the average time per frame for the compression of a sequence of 25 frames repeated 5 times.

From this i can think of three reasons why nothing has changed:

I somehow incorrectly implemented the seperate code execution paths
The sun interpreter is optimizing the two versions the same
I have misstaken the bottleneck.

I am wondering how i might go about determining which of these reasons is correct, or some, or all or none of them are.

ta!

rreyelts · February 19, 2004, 12:37pm

[quote]From this i can think of three reasons why nothing has changed:

I somehow incorrectly implemented the seperate code execution paths
The sun interpreter is optimizing the two versions the same
I have misstaken the bottleneck.

I am wondering how i might go about determining which of these reasons is correct, or some, or all or none of them are.
[/quote]
Well, at a glance, I can see a few bad things that you are doing.

You’re using 2D arrays. Generally, you’re better off with 1D arrays. Especially for cases like your “frame”. That should not be a 2D byte array. It should just be a simple int[] with the RGB values packed into a single int.
Redundant bit-masking. It looks like you are constantly re-doing conversions from signed bytes to unsigned ints. You should just read the rgb values from your (new) packed int array as ints.
Jeff’s point should hold too. To the optimizer, your “j” index is going to be a random value into the array. You’re incurring an array bounds check every single time you access any of your arrays.

I imagine that there are other optimizations, low-level and algorithmic, that can be applied to your code, but in it’s current state, it’s too obfuscated for me to spend any more time looking at it.

God bless,
-Toby Reyelts

mthornton · February 19, 2004, 12:46pm

I would suggest that you use a standard compression format and the new imageio package. It is easy to use and the low level optimisation is someone elses problem.

Jeff · February 19, 2004, 2:51pm

best ways to determine a bottleneck is with a profiler. If you think you maybe be misinterpreting what its telling you show us the output and maybe we can help.

rreyelts · February 19, 2004, 3:29pm

[quote]best ways to determine a bottleneck is with a profiler. If you think you maybe be misinterpreting what its telling you show us the output and maybe we can help.
[/quote]
He’s essentially already done that - he’s pointing to a single function that’s eating up all of his cpu and wondering how he can make that function faster. People can point out some micro-inefficiencies (ala array-bounds checks), but he should clean up his code some so people can point out any obvious algorithmic inefficiencies he has.

God bless,
-Toby Reyelts

Jeff · February 19, 2004, 4:26pm

Hey TR.

He actually brought up the issue of misinterpreting his profile (look above), I was suggesting how he could confirm/deny that.

b0b0b0b · February 19, 2004, 4:27pm

One suspicious looking thing here to me is:
BitOutputStream.addToBuffer() since I can’t see what it’s doing.

Looks like you’re doing a lot of this stuff too:


    for (int i=0;i<huffCodes[red].length;i++) 
    { 
     CCW.value=CCW.value<<1; CCW.count++; 
     if (huffCodes[red][i]) CCW.value=CCW.value| bitMask; 
     CCW.addToBuffer();
    }

I think someone said this or something similar, but you don’t want 2d boolean arrays.

An optimization would be to use 2d int arrays like this, but less lame:




assert huffCodes.length > 0 && (32 * (huffCodes.length-1) >= huffCodes[0]);
    for (int i=1;i<huffCodes[red].length-1;i++) 
    { 
     CCW.value = huffCodes[i];
     CCW.count += 32;
     CCW.addToBuffer();
    }
    if (huffCodes[red][0] > 0) {
     CCW.value = huffCodes[red][1];
     CCW.count += huffCodes[red][0];
     CCW.addToBuffer();
    }

rreyelts · February 19, 2004, 4:52pm

[quote]I would suggest that you use a standard compression format and the new imageio package. It is easy to use and the low level optimisation is someone elses problem.
[/quote]
I don’t think the imageio package has video codecs (i.e. something that is capable of inter-frame compression). Aside from that, the only lossless image format that imageio supports (OOTB) is PNG. I’m not sure that is appropriate.

Just out of curiosity, though, what do you plan on using the codec for? There are few applications that wouldn’t be better off with a lossy codec. (Medical imaging is a good example of one of them).

God bless,
-Toby Reyelts

mthornton · February 19, 2004, 5:43pm

On the images I compress the size difference between JPEG and PNG is small.

rreyelts · February 19, 2004, 6:16pm

I don’t understand your point. Are you trying to imply that there is no reason to use lossy encoding?

You must be using a very high quality level for JPEG or be working with abstract images. The compression used for PNG is deflate (the same thing you see in gzip) which performs nowhere near the compression on rounded DCTs in lossy JPEG.

Don’t even get me started about the differences you’d see with JPEG2000. The trade off you spend in CPU for the wavelet-based compression pays off in spades and can be fairly minimized due to its amenability to SIMD operations.

God bless,
-Toby Reyelts

moogie · February 19, 2004, 6:37pm

hmm, i didnt know that 2D array access was any more expensive than a 1D array Access…

Yeah, i noticed this almost imediately after i posted the code, i had changed it such that the conversion only occurs once at the beginning.

However if there was any speed increase doing this… it seems to be negligable.


                              j=h*width+w;
                              currentPixelRed=frame[j][0] & 0xFF; 
                              currentPixelGreen=frame[j][1] & 0xFF; 
                              currentPixelBlue=frame[j][2] & 0xFF; 
                              
            
                              if (k>-1)
                              {
                                    comparatorPixelRed=frame[k][0]& 0xFF; 
                                    comparatorPixelGreen=frame[k][1]& 0xFF; 
                                    comparatorPixelBlue=frame[k][2]& 0xFF; 
                                    Rdiff=256+currentPixelRed-comparatorPixelRed;
                                    Gdiff=256+currentPixelGreen-comparatorPixelGreen;
                                    Bdiff=256+currentPixelBlue -comparatorPixelBlue;
                              }

how do i let the optimizer know that j is never out of bounds?

I have not put any effort into obsfuscating the code… or is it too hard to read due to bad programming practices? please tell me so i can learn.

Cheers for your input

moogie · February 19, 2004, 6:41pm

[quote]I would suggest that you use a standard compression format and the new imageio package. It is easy to use and the low level optimisation is someone elses problem.
[/quote]
The whole point is that i am trying to make my own… for kicks

All the lossless compressors i have seen do not compress as much as they could, i.e. do not use motion estimation or other colourspaces.

I want to see how much lossless compression I can get using my own techniques, which are based on heavily extended work by Tadas Remencius.

moogie · February 19, 2004, 6:43pm

[quote]best ways to determine a bottleneck is with a profiler. If you think you maybe be misinterpreting what its telling you show us the output and maybe we can help.
[/quote]
I had used Jprofiler… I will workout how to save the output of the logger and then post the results in the next few hours.

moogie · February 19, 2004, 6:45pm

How can i do that? i believe i have cleaned it as much as i can… i will need pointers on how to futher clean the code.

I will post a link to the full source when in a few hours.

moogie · February 19, 2004, 6:55pm

[quote]One suspicious looking thing here to me is:
BitOutputStream.addToBuffer() since I can’t see what it’s doing.

Looks like you’re doing a lot of this stuff too:


    for (int i=0;i<huffCodes[red].length;i++) 
    { 
     CCW.value=CCW.value<<1; CCW.count++; 
     if (huffCodes[red][i]) CCW.value=CCW.value| bitMask; 
     CCW.addToBuffer();
    }

I think someone said this or something similar, but you don’t want 2d boolean arrays.

An optimization would be to use 2d int arrays like this, but less lame:




assert huffCodes.length > 0 && (32 * (huffCodes.length-1) >= huffCodes[0]);
    for (int i=1;i<huffCodes[red].length-1;i++) 
    { 
     CCW.value = huffCodes[i];
     CCW.count += 32;
     CCW.addToBuffer();
    }
    if (huffCodes[red][0] > 0) {
     CCW.value = huffCodes[red][1];
     CCW.count += huffCodes[red][0];
     CCW.addToBuffer();
    }

[/quote]
The problem i can see from your implementation is that i will still need to get the individual bits out of the value as the code may only be 1 or two bits but we are adding the full 32 bits so we would need to substract the other bits. I am not sure whether it is gain anything. But it is owrth thinking about!

Here is the BitOutputStream utility class i have made.



import java.io.*;

public class BitOutputStream
      {
            int value;                        // The byte in which the encoded bits are firstly stored.
            int count;                        // The number of bits written into value.
            byte[] buffer;                  // A byte buffer which is filled with 'value' each time value is full. Used for wirting to file.
            int buffCount;                  // The current number of 'values' written into the buffer.
            int masterCount;            // The overall count of bits that have been written
            OutputStream fos;
            
            public BitOutputStream(OutputStream fos1)
            {
                  fos=fos1;
                  value=0;
                  count=0;
                  buffer=new byte[4096];
                  buffCount=0;
                  masterCount=0;
            }
            
            // Writes the passed value (temp) to the file using the given number of bits
            public void write(int temp,int bits) throws IOException
            {

      

                        for (int j = 0, mask = 1; j < bits; j++, mask <<= 1)
                        {  
                              value=value<<1;count++;
                              if  ((temp & mask) > 0)
                              {
                                    value=value|0x01;
                              }
                              addToBuffer();
                        }
            }
      
            // adds bits stored in 'value' to a buffer which will be saved to a file
            // if the current bit count since last storing into the buffer is less than 8 then return without adding it to the buffer
            public void addToBuffer() throws IOException
            {
                  masterCount++;
            
                  if (count<8) return;

                  //byte temp=(byte) (value);
                  buffer[buffCount]=(byte) (value);;
                  buffCount++;
            
                  if (buffCount==buffer.length)
                  {
                        fos.write(buffer,0,buffCount);
                        buffCount=0;
                  }
                  value=0;
                  count=0;
            }
            
            public void write(byte[] b) throws IOException
            {
                  flush();
                  fos.write(b);
            }
            
            public void flush() throws IOException
            {
                  // pad out the last byte if necessary
                  
                  if (count>0)
                  {
                        value=value<<(8-count);
                        count=8;
                        addToBuffer();
                  }
                  if (buffCount>0)
                  {
                        
                        fos.write(buffer,0,buffCount);
                  }
                  buffCount=0;
            }
            
            public void close() throws IOException
            {
                  flush();
                  fos.close();
            }
      }