Here are some results from a nice little test I just ran, based on some work I was doing a couple of years ago when the issue came up of PowerBASIC's speed in comparison to C and C++. The interesting issue at the time was that some MSVC compilations were killing PowerBASIC in the same tests - by about a factor of 10! When Paul Dixon disassembled the VC code he discovered a very interesting thing. The compiler was examining the algorithm, determining it wasn't efficient, and re-writing it! In other words - optimization! The asm code generated by the compiler wasn't anything like the PowerBASIC code, which was just translating the source 'as is' into machine instructions. So to make the comparison useful, John Gleason suggested something more complicated than those little ditties all the compiler writers know about and hone their code against, which results in 'tainted' speed results. Anyway, here's John Gleason's algorithm - slightly modified by me ...
// Exercise
// =======================================
// 1) Create a 2MB string of dashes;
// 2) Change every 7th dash to a "P";
// 3) Replace every "P" with a "PU" (hehehe);
// 4) Replace every dash with an "8";
// 5) Put in a CrLf every 90 characters;
// 6) Output last 4K to Message Box.
I'll shortly post one of my many C++ examples that implement this, but here are my results of 10 runs as follows...
x86 32 bit code
===================================================
32 bit ansi string buffers, i.e., 2,000,000 chars and 2,000,000 bytes 18.6 ticks
32 bit wide string buffers, i.e., 2,000,000 wchars and 4,000,000 bytes 31.5 ticks
x64 64 bit code
===================================================
64 bit ansi string buffers, i.e., 2,000,000 chars and 2,000,000 bytes 28.1 ticks
64 bit wide string buffers, i.e., 2,000,000 wchars and 4,000,000 bytes 45.2 ticks
Narrow x86
===========
31
15
15
16
16
31
16
15
16
15
===
186 186/10 = 18.6 ticks
Wide x86
===========
16
32
31
16
47
32
32
31
47
31
===
315 315/10 = 31.5 ticks
narrow x64
===========
31
15
47
16
16
31
47
31
16
31
===
281 281/10 = 28.1 ticks
wide x64
=========
47
47
47
46
32
46
47
46
47
47
===
452 452/10 = 45.2 ticks
As can be seen above, ansi is faster than wide character, and 32 bit is faster than 64 bit. The slowest is unicode under native 64 bit, and the fastest is ansi in 32 bit mode. I used the MinGW GCC x86/x64 compiler for the x64 compilations, and an older MinGW GCC 32 bit compiler for the 32 bit compiles. Here is the source. To compile for unicode just uncomment the defines at top...
//#ifndef UNICODE
//#define UNICODE //strCls34U.cpp
//#endif
//#ifndef _UNICODE
//#define _UNICODE
//#endif
#include <Windows.h> //for MessageBox(), GetTickCount() and GlobalAlloc()
#include <tchar.h>
#include <String.h> //for strncpy(), strcpy(), strcat(), etc.
#include <cstdio> //for sprintf()
// Sizing constants for the exercise. All values are character counts, not bytes.
// The exercise:
//   1) create a 2MB string of dashes;        2) change every 7th dash to a "P";
//   3) replace every "P" with a "PU";        4) replace every dash with an "8";
//   5) put in a CrLf every 90 characters;    6) output last 4K to a message box.
enum
{
    NUMBER         = 2000001,                            // characters in the initial run of dashes
    LINE_LENGTH    = 90,                                 // characters copied between inserted CrLf pairs
    NUM_PS         = 1 + NUMBER / 7,                     // allowance for the number of "P"s: NUMBER/7 plus one
    PU_EXT_LENGTH  = NUM_PS + NUMBER,                    // length budget once each "P" has grown into "PU"
    NUM_FULL_LINES = PU_EXT_LENGTH / LINE_LENGTH,        // whole LINE_LENGTH-character lines in that budget
    MAX_MEM        = 2 * NUM_FULL_LINES + PU_EXT_LENGTH  // budget plus one 2-character CrLf per whole line
};
// Program entry point: runs the six-step string exercise once, times it with
// GetTickCount(), and shows the last 4000 characters of the result in a
// message box whose caption reports the elapsed ticks.
//
// The four WinMain parameters are not used.
// Returns 0 after a normal run, or 1 if either working buffer could not be
// allocated.
int __stdcall WinMain(HINSTANCE hInstance, HINSTANCE hPrevIns, LPSTR lpszArg, int nCmdShow)
{
    TCHAR szMsg[64], szTmp[16];   // message box caption and its numeric part
    int i = 0, iCtr = 0, j = 0;   // iterators/counters
    TCHAR* s1 = NULL;             // pointers to null terminated
    TCHAR* s2 = NULL;             // character array buffers
    DWORD tick = GetTickCount();  // timing starts before allocation, as in the original measurements

    // Step 5 advances j to exactly MAX_MEM (PU_EXT_LENGTH characters plus two
    // per full line), and the terminator is then stored at s1[j]. One extra
    // element keeps that store inside the buffer. GPTR returns zero-filled
    // fixed memory, which the step 6 offset depends on.
    s1 = (TCHAR*)GlobalAlloc(GPTR, (MAX_MEM + 1) * sizeof(TCHAR));
    s2 = (TCHAR*)GlobalAlloc(GPTR, (MAX_MEM + 1) * sizeof(TCHAR));
    if (s1 == NULL || s2 == NULL)
    {
        if (s1 != NULL)
            GlobalFree(s1);
        if (s2 != NULL)
            GlobalFree(s2);
        MessageBox(0, _T("Unable to allocate the working buffers."), _T("Out Of Memory"), MB_OK);
        return 1;
    }

    // 1) Create a 2MB string of dashes
    for (i = 0; i < NUMBER; i++)
        s1[i] = _T('-');

    // 2) Change every 7th dash to a "P". iCtr is incremented after the reset,
    //    so the "P"s land on indices 7, 14, 21, ...
    for (i = 0; i < NUMBER; i++, iCtr++)
    {
        if (iCtr == 7)
        {
            s1[i] = _T('P');
            iCtr = 0;
        }
    }

    // 3) Substitute 'PUs' for 'Ps', building the result in s2. The string
    //    library calls are kept on purpose: they are part of the workload
    //    being timed.
    iCtr = 0;
    for (i = 0; i < NUMBER; i++)
    {
        if (_tcsncmp(s1 + i, _T("P"), 1) == 0)
        {
            _tcscpy(s2 + iCtr, _T("PU"));
            iCtr += 2;
        }
        else
        {
            s2[iCtr] = s1[i];
            iCtr++;
        }
    }

    // 4) Replace every '-' with an '8'
    for (i = 0; i < PU_EXT_LENGTH; i++)
    {
        if (s2[i] == _T('-'))
            s2[i] = _T('8');
    }

    // 5) Put in a CrLf every LINE_LENGTH characters, copying s2 back into s1
    i = 0;
    j = 0;
    iCtr = 0;
    while (i < PU_EXT_LENGTH)
    {
        s1[j] = s2[i];
        i++;
        j++;
        iCtr++;
        if (iCtr == LINE_LENGTH)
        {
            s1[j] = 13;   // carriage return
            j++;
            s1[j] = 10;   // line feed
            j++;
            iCtr = 0;
        }
    }
    s1[j] = 0;
    s2[0] = 0;

    // 6) Output last (right most) 4 K to MessageBox().
    //    PU_EXT_LENGTH is one larger than the text step 3 produced, so the
    //    last element copied in step 5 is a zero from the zero-filled buffer.
    //    Backing up 4001 from j therefore selects the 4000 characters that
    //    precede that zero.
    _tcsncpy(s2, &s1[j] - 4001, 4000);
    s2[4000] = 0;

    tick = GetTickCount() - tick;

    // Caption: fixed text plus at most 10 digits, well inside szMsg's 64 elements.
    _tcscpy(szMsg, _T("Here's Your String John In "));
    _stprintf(szTmp, _T("%u"), (unsigned)tick);
    _tcscat(szMsg, szTmp);
    _tcscat(szMsg, _T(" ticks!"));
    MessageBox(0, s2, szMsg, MB_OK);

    GlobalFree(s1);
    GlobalFree(s2);
    return 0;
}
I might add that a 2,000,000 byte string is kind of tight for using low resolution GetTickCount(). For real fast machines you might want to make the string 10 MB or whatever.