Author Topic: String Clean / Assembler  (Read 10378 times)

0 Members and 1 Guest are viewing this topic.

Offline Charles Pegge

  • Global Moderator
  • Hero Member
  • *****
  • Posts: 672
  • User-Rate: +27/-1
    • Charles Pegge
String Clean / Assembler
« on: July 12, 2007, 01:12:41 AM »
This cleans up a string, converting control characters to spaces and then removing excess spaces. It also trims the string left and right and converts it to lower case.

It works directly on a string buffer without creating a new string. It is ruthlessly efficient.


PowerBasic

Code: [Select]
' String Clean
' Charles E V Pegge
' 12 July 2007

' PowerBasic ver 8.x

#COMPILE EXE
#DIM ALL

SUB StringClean(BYREF ss AS STRING, BYREF le AS LONG)

' Cleans the first le bytes of ss in place: the same string buffer is
' used for source and result, no new string is created.
'
' In one pass:
'  trims off start and end spaces
'  excludes multiple spacing
'  converts control chars (codes 0-31: tab, cr and others) to spaces
'  converts uppercase A-Z to lowercase
'
' Characters 128-255 are ordinary text and are copied unchanged. Every
' character test below is an UNSIGNED compare (ja/jb/jae); with signed
' jumps those bytes look negative and would be turned into spaces.
'
' Parameters:
'  ss  string whose buffer is rewritten (LEN(ss) itself is not changed)
'  le  in:  number of bytes of ss to process (clamped to LEN(ss))
'      out: length of the cleaned text; read it with LEFT$(ss, le)
'
' Register roles inside the assembler section:
'  esi = read pointer (next source byte)
'  edi = write pointer (next free destination byte, never ahead of esi)
'  ebx = start of the buffer
'  edx = one past the last source byte to process
'  al  = current character

 #REGISTER NONE           ' disallow compiler to assume register availability
                          '
 DIM p AS LONG            '
 IF le > LEN(ss) THEN le = LEN(ss) ' never scan past the end of the buffer
 IF le <= 0 THEN          ' nothing to process (also covers an empty string)
   le = 0
   EXIT SUB
 END IF
 p=STRPTR(ss)             ' make a pointer to the string buffer content
'-------------------------'
  ! mov esi,p             ' source pointer
  ! mov edi,esi           ' dest pointer starts at the same place
  ! mov ebx,esi           ' remember the start of the buffer
  ! mov edx, le           ' address of length
  ! mov edx, [edx]        ' get the length of string bytes to process
  ! add edx, esi          ' edx = source boundary (one past the last byte)
'-------------------------'
 nextl:                   ' main loop
'-------------------------'
  ! cmp esi,edx           ' boundary check (addresses are unsigned)
  ! jae xit               ' finish if at end
  ! mov al,[esi]          ' load current char
  ! inc esi               ' advance to the next char
  ! cmp al,65             ' upper case check
  ! jb ncase              ' skip if below "A"
  ! cmp al,90             '
  ! ja ncase              ' skip if above "Z"
  ! or al,&h20            ' convert upper to lower
'-------------------------'
 ncase:                   '
'-------------------------'
  ! cmp al,32             ' is it a space or a control char?
  ! ja nspace             ' no: store it as it is
  ! cmp edi,ebx           ' anything stored yet?
  ! je nextl              ' no: a leading space is not allowed
  ! cmp byte ptr [edi-1],32 ' was the previous stored char a space?
  ! je nextl              ' yes: do not store a second one
  ! mov byte ptr [edi],32 ' store a single space
  ! inc edi               '
  ! jmp nextl             ' repeat the loop
'-------------------------'
 nspace:                  '
'-------------------------'
  ! mov [edi],al          ' store the char
  ! inc edi               '
  ! jmp nextl             ' repeat the loop
'-------------------------'
 xit:                     '
'-------------------------'
  ! cmp edi,ebx           ' empty result?
  ! je ntrim              ' then there is nothing to trim
  ! cmp byte ptr [edi-1],32 ' is the last stored char a space?
  ! jne ntrim             ' if not then skip
  ! dec edi               ' exclude this trailing space
'-------------------------'
 ntrim:                   '
'-------------------------'
  ! sub edi,ebx           ' bytes stored = length of the cleaned text
  ! mov edx,le            ' get pointer to length value
  ! mov [edx],edi         ' assign length of altered string
'-------------------------'

END SUB



FUNCTION PBMAIN () AS LONG

' Demo: clean a sample string in place and show the result between
' >> and << markers so that any remaining edge spaces would be visible.

LOCAL sample AS STRING
LOCAL sampleLen AS LONG

sample = "   ONE   "+$CR+"   two    three  four    "
sampleLen = LEN(sample)

' StringClean rewrites the buffer and returns the new length in sampleLen
CALL StringClean(sample, sampleLen)

MSGBOX ">>"+LEFT$(sample, sampleLen)+"<<"

END FUNCTION
« Last Edit: July 12, 2007, 08:18:15 AM by Charles Pegge »

Offline Paul Squires

  • Jr. Member
  • **
  • Posts: 90
  • User-Rate: +11/-5
    • PlanetSquires
Re: String Clean / Assembler
« Reply #1 on: July 12, 2007, 01:15:07 AM »
A very useful function indeed especially since it does the modifications in place. Very impressive! I'm sure that I will find a need for this type of function.

Thanks for sharing. Keep 'em coming!  :)

Paul Squires
FireFly Visual Designer SQLitening Database System JellyFish Pro Editor
http://www.planetsquires.com

Offline Charles Pegge

  • Global Moderator
  • Hero Member
  • *****
  • Posts: 672
  • User-Rate: +27/-1
    • Charles Pegge
Re: String Clean / Assembler
« Reply #2 on: July 12, 2007, 01:21:00 PM »
Thank you Paul,
Any suggestions welcome.

Offline Charles Pegge

  • Global Moderator
  • Hero Member
  • *****
  • Posts: 672
  • User-Rate: +27/-1
    • Charles Pegge
Re: String Clean / Assembler
« Reply #3 on: July 13, 2007, 11:02:38 AM »

String Clean without using Assembler:

Code: [Select]

' String CleanB
' Charles E V Pegge
' 13 July 2007

' PowerBasic ver 8.x


#COMPILE EXE
#DIM ALL

' parameters:
' 1 dynamic string
' 2 number of characters to be processed
'
' result:
' 1 cleaned up string
' 2 character count of cleaned up string

SUB StringCleanB(BYREF ss AS STRING, BYREF d AS LONG) ' Remove excess spaces

' Works on the first d bytes of ss, writing the result back into the
' same buffer:
'  - leading and trailing spaces are dropped
'  - runs of spaces collapse to a single space
'  - every byte with a code of 32 or less counts as a space
'
' On return d holds the number of bytes in the cleaned text; the caller
' reads the result with LEFT$(ss, d).

LOCAL srcLen AS LONG     ' number of source bytes to scan
LOCAL src AS BYTE PTR    ' read position
LOCAL dst AS BYTE PTR    ' last byte written (one before the buffer at start)
LOCAL i AS LONG          ' scan counter

srcLen = d
d = 0
src = STRPTR(ss)
dst = src - 1

FOR i = 1 TO srcLen
 IF @src > 32 THEN
  ' visible character: always copied
  INCR dst
  @dst = @src
  INCR d
 ELSEIF d > 0 THEN
  ' blank: stored once, and only after something visible
  ' (to treat only nulls and spaces as blanks, the test above would
  '  become: (@src<>0) AND (@src<>32) )
  IF @dst > 32 THEN
   INCR dst
   @dst = 32
   INCR d
  END IF
 END IF
 INCR src
NEXT i

' a single trailing space may remain: leave it out of the count
IF d > 0 THEN
 IF @dst = 32 THEN DECR d
END IF

END SUB



FUNCTION PBMAIN () AS LONG

' Demo: run StringCleanB over a sample string and display the cleaned
' part of the buffer between >> and << markers.

LOCAL sample AS STRING
LOCAL sampleLen AS LONG

sample = "   one   "+$CR+"   two    three      "
sampleLen = LEN(sample)

' on return sampleLen is the character count of the cleaned text
CALL StringCleanB(sample, sampleLen)

MSGBOX ">>"+LEFT$(sample, sampleLen)+"<<"

END FUNCTION



MikeTrader

  • Guest
Re: String Clean / Assembler
« Reply #4 on: July 13, 2007, 09:06:35 PM »
Charles,
Fantastic. Thx so much!

Offline Kent Sarikaya

  • Full Member
  • ***
  • Posts: 173
  • User-Rate: +8/-4
Re: String Clean / Assembler
« Reply #5 on: July 14, 2007, 09:43:04 AM »
Thanks for the code examples. These will be very useful.

Offline Charles Pegge

  • Global Moderator
  • Hero Member
  • *****
  • Posts: 672
  • User-Rate: +27/-1
    • Charles Pegge
Re: String Clean / Assembler
« Reply #6 on: July 15, 2007, 09:55:07 PM »
Code Clean

This is a more sophisticated version of String Clean which is intended for cleaning up source code prior to compilation or scripting. It allows text contained between quotes to pass through with all spaces and other characters unaltered, while removing excess spaces from the rest of the text.

Comments are detected and stripped off, and line continuation (underscore) also truncates the line. Leading and trailing spaces are removed after the line has been truncated in this way. Finally, the ascii code causing the truncation is returned by the function.


PowerBasic

Code: [Select]

' CodeClean
' Charles E V Pegge
' 15 July 2007

' PowerBasic V 8.x

#COMPILE EXE
#DIM ALL


FUNCTION CodeClean(BYREF s AS STRING, BYREF le AS LONG) AS LONG

#REGISTER NONE

' Cleans one line of source text in place (source and destination are
' the same buffer) and reports where and why the line ended.
'
' cuts off at comments // (ascii 47 47)
' cuts off at comments | (ascii 124)
' cuts off at line continuations _ (ascii 95)
' gives an empty result when the first non-blank character is
'   ' (ascii 39), * (ascii 42) or / (ascii 47)
' strips left and right spaces
' excludes multiple spaces except within quotes (single or double)
' returns last ascii code parsed
'
' Text between a quote character and the next identical quote character
' is copied verbatim; the cut-off characters are not recognised there.
' If the closing quote is missing, the returned length stops just after
' the opening quote.
'
' Characters 128-255 are ordinary text: the "is it blank?" tests use
' unsigned jumps (jbe) so those bytes are neither trimmed on the left
' nor treated as trailing spaces.

' Parameters:
' 1 source/dest string
' 2 length of string to be parsed (clamped to LEN(s));
'   on return: length of the cleaned text, read it with LEFT$(s, le)

' Return:
' ascii code of last character parsed (0 if nothing but blanks was seen).

' Register roles:
' esi = source read pointer
' edi = destination write pointer
' ebx = destination position just past the last non-space char stored
'       (result length = ebx - ps)
' ecx = source bytes still to be read
' al  = current char, ah = opening quote char while inside quotes


DIM ps AS BYTE PTR
IF le > LEN(s) THEN le = LEN(s) ' never read past the end of the buffer
ps=STRPTR(s)
'asm
'===================='
! mov esi,ps         ' string source
! mov edi,esi        ' string dest
! mov ebx,edi        ' last non-space position
! mov ecx,le         ' length pointer
! mov ecx,[ecx]      ' source length down counter
! xor eax,eax        ' char
! dec esi            ' adjust pointer for overshoot
'--------------------'
rep_ltrim:           '
'--------------------'
! inc esi            ' for next char
! dec ecx            ' any chars left?
! jl exit_extract    ' if not then finish
! cmp byte ptr [esi],32
! jbe rep_ltrim      ' continue left trimming (unsigned: 128-255 are text)
'--------------------'
! mov al,[esi]       '
'--------------------'
'                    ' check for comment symbols
'                    '
! cmp al,39          ' line starting with single quote
! jz exit_extract    '
! cmp al,42          ' line starting with star *
! jz exit_extract    '
! cmp al,47          ' line starting with slash /
! jz exit_extract    '
'                    '
'--------------------'
! inc ecx            ' adjust counter for overshoot
'===================='
rep_extract:         ' loop point
'--------------------'
! dec ecx            ' any more chars?
! jl rtrim_extract   ' if not then finish
! mov al,[esi]       ' get the character
! inc esi            ' increment the char pointer
'--------------------'
! cmp al,34          ' is it a double quote?
! jz quo             ' then copy the quoted section verbatim
! cmp al,39          ' is it a single quote?
! jz quo             ' then copy the quoted section verbatim
! jmp nquo           ' otherwise skip over quotes section
'--------------------'
quo:                 '
'--------------------'
! mov ah,al          ' hold the opening quote in ah to find its partner
! mov [edi],al       ' store it
! inc edi
! mov ebx,edi        ' track non-space
'--------------------'
rep_quo:             ' loop
! dec ecx            ' any chars left?
! jl exit_extract    ' finish if not
! mov al,[esi]       ' get char
! inc esi            ' next
! mov [edi],al       ' store char
! inc edi            ' next location
! cmp al,ah          ' compare with quote char at beginning
! jnz rep_quo        ' continue looping if not end-quote
'--------------------'
! mov ah,0           ' clear ah of quote char
! mov ebx,edi        ' mark as non space
! jmp rep_extract    ' continue with main loop
'--------------------'
nquo:                '
'--------------------'
! cmp al,47          ' check for '//'
! jnz n47            ' skip if not ascii 47
! cmp ecx,0          ' end of string?
! jz n47             ' skip if so
! cmp byte ptr [esi],47 ' look ahead for '/'
! jz rtrim_extract   '
'--------------------'
n47:                 '
'--------------------'
! cmp al,124         ' check vertical bar as comment marker
! jz rtrim_extract   ' trim off and exit
! cmp al,95          ' check under score as line continuation
! jz rtrim_extract   ' trim off and exit
! cmp al,32          ' is it a space?
! jnz nspace         ' if not then proceed to store char
'--------------------'
! cmp edi,ebx        ' was there a previous space
! jnz rep_extract    ' if there was then do not store another
'--------------------'
nspace:              '
'--------------------'
! mov [edi],al       ' store at dest
! inc edi            ' increment dest pointer
! cmp al,32          ' is it a space (or a control char)?
! jbe rep_extract    ' then skip updating ebx (unsigned: 128-255 are text)
! mov ebx,edi        ' otherwise record the edi ptr in ebx
! jmp rep_extract    ' repeat
'--------------------'
rtrim_extract:       ' trailing spaces lie beyond ebx, so they drop out of the length
'--------------------'
exit_extract:        '
'--------------------'
! sub ebx,ps         ' get length
! mov ecx,le         ' get the le pointer
! mov [ecx],ebx      ' store the length
! mov ah,0           ' clear any ah quote matchers
! mov function,eax   ' store the last char encountered
'===================='
'end asm

END FUNCTION


FUNCTION PBMAIN()

' Demo: clean one sample source line and report the cleaned text, its
' length and the code of the character that ended the parse.

LOCAL sourceLine AS STRING
LOCAL lineLen AS LONG
LOCAL terminator AS LONG

sourceLine = " a  =    b+c + 'z  |   //  z'  //|  this is a comment "
' other inputs worth trying:
'sourceLine = "    1* "
'sourceLine = " *  "
'sourceLine = "  "

lineLen = LEN(sourceLine)
terminator = CodeClean(sourceLine, lineLen)

MSGBOX ">>"+LEFT$(sourceLine, lineLen)+"<<"+"   length:"+STR$(lineLen)+"    terminator code:"+STR$(terminator)

END FUNCTION

« Last Edit: July 15, 2007, 10:06:37 PM by Charles Pegge »

Offline Charles Pegge

  • Global Moderator
  • Hero Member
  • *****
  • Posts: 672
  • User-Rate: +27/-1
    • Charles Pegge
Re: String Clean / Assembler
« Reply #7 on: July 15, 2007, 10:09:43 PM »

CodeClean

The equivalent in Freebasic Assembler


FreeBasic

Code: [Select]
' CodeClean
' Charles E V Pegge
' 15 July 2007

' FREEBASIC 0.16b

function CodeClean(byref s as string, byref le as long) as long

' Cleans one line of source text in place (source and destination are
' the same buffer) and reports where and why the line ended.
'
' cuts off at comments `//` ascii 47 47
' cuts off at comments `|` ascii 124
' cuts off at line continuations `_` ascii 95
' gives an empty result when the first non-blank character is
'   ' (ascii 39), * (ascii 42) or / (ascii 47)
' strips left and right spaces
' excludes multiple spaces except within quotes (single or double)
' returns last ascii code parsed
'
' Text between a quote character and the next identical quote character
' is copied verbatim; the cut-off characters are not recognised there.
' If the closing quote is missing, the returned length stops just after
' the opening quote.
'
' Characters 128-255 are ordinary text: the "is it blank?" tests use
' unsigned jumps (jbe) so those bytes are neither trimmed on the left
' nor treated as trailing spaces.

' Parameters:
' 1 source/dest string
' 2 length of string to be parsed (clamped to len(s));
'   on return: length of the cleaned text, read it with left$(s, le)

' Return:
' ascii code of last character parsed (0 if nothing but blanks was seen).

' Register roles:
' esi = source read pointer
' edi = destination write pointer
' ebx = destination position just past the last non-space char stored
'       (result length = ebx - ps)
' ecx = source bytes still to be read
' al  = current char, ah = opening quote char while inside quotes


dim ps as byte ptr
if le > len(s) then le = len(s) ' never read past the end of the buffer
ps=strptr(s)
asm
'==================='
 mov esi,[ps]       ' string source
 mov edi,esi        ' string dest
 mov ebx,edi        ' last non-space position
 mov ecx,[le]       ' length pointer
 mov ecx,[ecx]      ' source length down counter
 xor eax,eax        ' char
 dec esi            ' adjust pointer for overshoot
'-------------------'
rep_ltrim:          '
'-------------------'
 inc esi            ' for next char
 dec ecx            ' any chars left?
 jl exit_extract    ' if not then finish
 cmp byte ptr [esi],32
 jbe rep_ltrim      ' continue left trimming (unsigned: 128-255 are text)
'-------------------'
 mov al,[esi]       '
'-------------------'
                    ' check for comment symbols
                    '
 cmp al,39          ' line starting with single quote
 jz exit_extract    '
 cmp al,42          ' line starting with star *
 jz exit_extract    '
 cmp al,47          ' line starting with slash /
 jz exit_extract    '
                    '
'-------------------'
 inc ecx            ' adjust counter for overshoot
'==================='
rep_extract:        ' loop point
'-------------------'
 dec ecx            ' any more chars?
 jl rtrim_extract   ' if not then finish
 mov al,[esi]       ' get the character
 inc esi            ' increment the char pointer
'-------------------'
 cmp al,34          ' is it a double quote?
 jz quo             ' then copy the quoted section verbatim
 cmp al,39          ' is it a single quote?
 jz quo             ' then copy the quoted section verbatim
 jmp nquo           ' otherwise skip over quotes section
'-------------------'
quo:                '
'-------------------'
 mov ah,al          ' hold the opening quote in ah to find its partner
 mov [edi],al       ' store it
 inc edi
 mov ebx,edi        ' track non-space
'-------------------'
rep_quo:            ' loop
 dec ecx            ' any chars left?
 jl exit_extract    ' finish if not
 mov al,[esi]       ' get char
 inc esi            ' next
 mov [edi],al       ' store char
 inc edi            ' next location
 cmp al,ah          ' compare with quote char at beginning
 jnz rep_quo        ' continue looping if not end-quote
'-------------------'
 mov ah,0           ' clear ah of quote char
 mov ebx,edi        ' mark as non space
 jmp rep_extract    ' continue with main loop
'-------------------'
nquo:               '
'-------------------'
 cmp al,47          ' check for '//'
 jnz n47            ' skip if not ascii 47
 cmp ecx,0          ' end of string?
 jz n47             ' skip if so
 cmp byte ptr [esi],47 ' look ahead for '/'
 jz rtrim_extract   '
'-------------------'
n47:                '
'-------------------'
 cmp al,124         ' check vertical bar as comment marker
 jz rtrim_extract   ' trim off and exit
 cmp al,95          ' check under score as line continuation
 jz rtrim_extract   ' trim off and exit
 cmp al,32          ' is it a space?
 jnz nspace         ' if not then proceed to store char
'-------------------'
 cmp edi,ebx        ' was there a previous space
 jnz rep_extract    ' if there was then do not store another
'-------------------'
nspace:             '
'-------------------'
 mov [edi],al       ' store at dest
 inc edi            ' increment dest pointer
 cmp al,32          ' is it a space (or a control char)?
 jbe rep_extract    ' then skip updating ebx (unsigned: 128-255 are text)
 mov ebx,edi        ' otherwise record the edi ptr in ebx
 jmp rep_extract    ' repeat
'-------------------'
rtrim_extract:      ' trailing spaces lie beyond ebx, so they drop out of the length
'-------------------'
exit_extract:       '
'-------------------'
 sub ebx,[ps]       ' get length
 mov ecx,[le]       ' get the le pointer
 mov [ecx],ebx      ' store the length
 mov ah,0           ' clear any ah quote matchers
 mov [function],eax ' store the last char encountered
'==================='
end asm

end function


' Demo: clean one sample source line, then print the cleaned text between
' >> and << markers followed by its length and the terminator code.

dim sourceLine as string
dim lineLen as long
dim terminator as long

sourceLine = " a  =    b+c + 'z  |   //  z'  //|  this is a comment "
' other inputs worth trying:
'sourceLine = "    1* "
'sourceLine = " *  "
'sourceLine = "  "

lineLen = len(sourceLine)
terminator = CodeClean(sourceLine, lineLen)

print ">>"+left$(sourceLine, lineLen)+"<<", lineLen, terminator

end