Re: String trim (was JavaScript Functions)



kangax wrote:
Garrett Smith wrote:
[...]
Is this fixed in recent JScript in IE8?

I'll check this later, will let you know.

If that question was whether \s in a regular expression matches '\u00A0' in IE 8 then it appears that the answer is no.

Question: Why not:-

/**
 * Removes leading and trailing white space from a string, treating the
 * no-break space (U+00A0) as white space even in engines whose \s does
 * not match it.
 *
 * Inside a character class "|" is a literal character, not alternation,
 * so it must not appear there; otherwise leading and trailing pipe
 * characters would be stripped as well.
 *
 * @param {string} s The string to trim.
 * @returns {string} A copy of s without leading/trailing white space.
 */
function trimString(s){
  return s.replace(/^[\s\xA0]+|[\s\xA0]+$/g, '');
}

That would work, sure.
<snip>

By "work" I assume you mean that it corrects the fact that some JavaScript engines do not match '\u00A0' (the non-breaking space character) when \s is used in a regular expression. But is this an attempt to create methods that conform to the ECMA (3rd Ed.) specification, and thus are consistent across platforms, or is it a desire to arbitrarily include the non-breaking space in the set of matched characters for a trim function?

The former seems the more reasonable goal, but in that case this trim function still falls short: where '\u00A0' is not matched, the odds are extremely good that '\u2003' (EM space), to name but one, is not matched either. So the above function will still produce inconsistent results between JavaScript engines.

The ECMA spec wants \s to match JavaScript's white space characters and its line terminator characters, but the definition of white space includes all of Unicode's Zs group, and JScript, for example, does not match the majority of those.

Try this out:-

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title></title>
</head>
<body>
<pre>
<script type="text/javascript">

/* White space characters that the report lists under "Should be matched
   by \s". One record per character:
     cp        - decimal code point, as a string
     codePoint - the same code point in hexadecimal notation
     character - the character itself, written as a Unicode escape
     name      - character name
     group     - Unicode general category
   NOTE(review): U+180E is listed here as Zs; its general category may
   differ in later Unicode versions - confirm against the version targeted.
*/
var WhiteSpace = [
  { cp: "9",     codePoint: "0x0009", character: "\u0009", name: "<control>[ASCII Tab]",          group: "Cc" },
  { cp: "11",    codePoint: "0x000B", character: "\u000B", name: "<control>[ASCII Vertical Tab]", group: "Cc" },
  { cp: "12",    codePoint: "0x000C", character: "\u000C", name: "<control>[ASCII Form Feed]",    group: "Cc" },
  { cp: "32",    codePoint: "0x0020", character: "\u0020", name: "SPACE",                         group: "Zs" },
  { cp: "160",   codePoint: "0x00A0", character: "\u00A0", name: "NO-BREAK SPACE",                group: "Zs" },
  { cp: "5760",  codePoint: "0x1680", character: "\u1680", name: "OGHAM SPACE MARK",              group: "Zs" },
  { cp: "6158",  codePoint: "0x180E", character: "\u180E", name: "MONGOLIAN VOWEL SEPARATOR",     group: "Zs" },
  { cp: "8192",  codePoint: "0x2000", character: "\u2000", name: "EN QUAD",                       group: "Zs" },
  { cp: "8193",  codePoint: "0x2001", character: "\u2001", name: "EM QUAD",                       group: "Zs" },
  { cp: "8194",  codePoint: "0x2002", character: "\u2002", name: "EN SPACE",                      group: "Zs" },
  { cp: "8195",  codePoint: "0x2003", character: "\u2003", name: "EM SPACE",                      group: "Zs" },
  { cp: "8196",  codePoint: "0x2004", character: "\u2004", name: "THREE-PER-EM SPACE",            group: "Zs" },
  { cp: "8197",  codePoint: "0x2005", character: "\u2005", name: "FOUR-PER-EM SPACE",             group: "Zs" },
  { cp: "8198",  codePoint: "0x2006", character: "\u2006", name: "SIX-PER-EM SPACE",              group: "Zs" },
  { cp: "8199",  codePoint: "0x2007", character: "\u2007", name: "FIGURE SPACE",                  group: "Zs" },
  { cp: "8200",  codePoint: "0x2008", character: "\u2008", name: "PUNCTUATION SPACE",             group: "Zs" },
  { cp: "8201",  codePoint: "0x2009", character: "\u2009", name: "THIN SPACE",                    group: "Zs" },
  { cp: "8202",  codePoint: "0x200A", character: "\u200A", name: "HAIR SPACE",                    group: "Zs" },
  { cp: "8239",  codePoint: "0x202F", character: "\u202F", name: "NARROW NO-BREAK SPACE",         group: "Zs" },
  { cp: "8287",  codePoint: "0x205F", character: "\u205F", name: "MEDIUM MATHEMATICAL SPACE",     group: "Zs" },
  { cp: "12288", codePoint: "0x3000", character: "\u3000", name: "IDEOGRAPHIC SPACE",             group: "Zs" }
];

/* Line terminator characters; the report lists these, too, under
   "Should be matched by \s". Records have the fields cp (decimal code
   point as a string), codePoint (hexadecimal notation), character,
   name and group (Unicode general category).
*/
var LineTerminator = [
  { cp: "8232", codePoint: "0x2028", character: "\u2028", name: "LINE SEPARATOR",                   group: "Zl" },
  { cp: "8233", codePoint: "0x2029", character: "\u2029", name: "PARAGRAPH SEPARATOR",              group: "Zp" },
  { cp: "10",   codePoint: "0x000A", character: "\u000A", name: "<control>[ASCII Line Feed]",       group: "Cc" },
  { cp: "13",   codePoint: "0x000D", character: "\u000D", name: "<control>[ASCII Carriage Return]", group: "Cc" }
];

/* Format-control (Cf) characters that the report lists under "Must not
   be matched by \s (in ES 3)". Records have the fields cp (decimal code
   point as a string), codePoint (hexadecimal notation), character,
   name and group (Unicode general category).
*/
var NonWhiteSpace = [
  { cp: "8203",  codePoint: "0x200B", character: "\u200B", name: "ZERO WIDTH SPACE (will be whitespace in ES 3.1)",          group: "Cf" },
  { cp: "8204",  codePoint: "0x200C", character: "\u200C", name: "ZERO WIDTH NON-JOINER",                                    group: "Cf" },
  { cp: "8205",  codePoint: "0x200D", character: "\u200D", name: "ZERO WIDTH JOINER",                                        group: "Cf" },
  { cp: "65279", codePoint: "0xFEFF", character: "\uFEFF", name: "ZERO WIDTH NO-BREAK SPACE (will be whitespace in ES 3.1)", group: "Cf" }
];



/* The character lists that are reported under the "Should be matched
   by \s" heading; each list is walked in turn by the report loop. */
var testAr = [WhiteSpace, LineTerminator];

/* The pattern under test: whatever the host engine treats as \s. */
var testRX = /\s/g;

/**
 * Checks whether the engine's \s matches one character and formats the
 * outcome as a single report line.
 *
 * @param {Object} chObj Character record with the string properties
 *     cp (decimal code point), codePoint, character, name and group.
 * @returns {string} A newline-terminated line. Normally it starts with
 *     "true" (matched by \s) or "false" (not matched), followed by the
 *     character as left after stripping, the code point, name and group.
 *     If String.fromCharCode misbehaves, an "Anomaly ..." line is
 *     returned instead.
 */
function testWS(chObj){
  /* "char" is a reserved word in ES3, so the local is named "ch". */
  var ch, stripped, matches;

  ch = String.fromCharCode(+chObj.cp);
  if(!ch){
    return 'Anomaly: Non-empty string is false.\n';
  }

  /* First verify that the String.fromCharCode result matches the
     Unicode escape sequence based string literal for the character.
  */
  if(ch !== chObj.character){
    return 'Anomaly in String.fromCharCode('+chObj.cp+')\n';
  }

  /* If replacing \s matches with nothing leaves the character intact,
     then \s did not match it. */
  stripped = ch.replace(testRX, '');
  matches = (stripped !== chObj.character);

  return (
    matches+
    '\t"'+stripped+'"\tcp = '+chObj.codePoint+'\tname = '+
    chObj.name+' group = '+chObj.group+'\n'
  );
}

/* Report driver: writes one line per character into the surrounding
   <pre>, first for the characters that \s should match, then for those
   it must not match. */
var c, cp, list, ctr;

/* Escapes HTML markup characters. document.write parses its argument as
   HTML, so a name such as "<control>[ASCII Tab]" would otherwise lose
   its "<control>" part to the parser. */
function escapeMarkup(text){
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}

/* Writes the testWS line for chObj, replacing the leading "true" or
   "false" with the given verdicts. The patterns are anchored so that
   the word "false" inside an anomaly message is left alone. */
function writeResult(chObj, ifMatched, ifNotMatched){
  document.write(escapeMarkup(
    testWS(chObj)
      .replace(/^true/, ifMatched)
      .replace(/^false/, ifNotMatched)
  ));
}

document.write('Should be matched by \\s\n');
for(c = 0;c < testAr.length;++c){
  list = testAr[c];
  for(cp = 0;cp < list.length;++cp){
    ctr = list[cp];
    writeResult(ctr, 'GOOD', 'FAIL');
  }
}

document.write('\n\nMust not be matched by \\s (in ES 3)\n');
for(cp = 0;cp < NonWhiteSpace.length;++cp){
  ctr = NonWhiteSpace[cp];
  writeResult(ctr, 'FAIL', 'GOOD');
}
</script>
</pre>
</body>
</html>

Richard.

.



Relevant Pages

  • Re: Empty input box
    ... That regular expression will match a string consisting only of one or more 's' characters. ... it will return null (which type-converts to boolean false). ... That will match a string consisting of one or more whitespace characters only. ...
    (comp.lang.javascript)
  • Re: regex/replace white list
    ... Whether to use a white list (i.e. list of allowed characters) or a black list is probably best decided by which one gives the smaller list. ... As far as a regular expression is concerned, the difference between the two is whether to use the NOT operator or not. ... // someString doesn't contain the letter 'a' ... var re = new RegExp; ...
    (comp.lang.javascript)
  • Re: find word(s) in string
    ... I would consider using a regular expression. ... If you need to know if you matched either ICE or NC, ... = match either the beginning of the line or one or more whitespace ... characters. ...
    (microsoft.public.dotnet.languages.vb)
  • Re: Regular Expression Function
    ... I want a regular expression to compare sentences and then rate them as ... I have an array with a list of other phrases like so... ... characters will throw things off. ... "In an hour the system will go down for maintenance". ...
    (alt.php)
  • Re: Regular Expression Function
    ... I want a regular expression to compare sentences and then rate them as ... I have an array with a list of other phrases like so.. ... These will be stripped from the input first. ... characters will throw things off. ...
    (alt.php)