parsing multi-line print blocks
- From: Ed Morton <mortonspam@xxxxxxxxx>
- Date: Sat, 28 Jan 2012 09:13:57 -0600
The script below will parse an awk script and turn statements like this:
print <<!
line 1
line 2
line 3
!
into this:
print " line 1"
print " line 2"
print " line 3"
It's part of a shell script designed to pre-process awk scripts so you can specify pre-formatted multi-line print statements (like "cat" with a shell here-document) in your awk scripts for generating code from templates.
I'm interested in general feedback but in particular:
1) Is it "better" (whatever that means to you) to convert the input above into the shown output or should it be converted to a single print instead:
print " line 1\n line 2\n line 3"
or something else?
2) When the pre-formatted block is specified with a leading "<<+" instead of just "<<", the tool attempts to detect and remove all common leading tabs so you can write your pre-formatted block indented with tabs so it looks good in your script without those tabs appearing in the output. Is there a "better" way to do that than what I do today? If they use "<<-" then ALL leading tabs are removed, again just like a shell here document.
The awk script is below within "expand_prints()" function of the full shell script ("epawk") for context.
Yes, I know it doesn't have to be gawk-specific; I will adjust that later. I already posted a version of this at comp.unix.shell for input on the shell part of it.
Ed.
##########################################################
# Extended Print AWK
#
# Allows printing of pre-formatted blocks of multi-line text in awk scripts.
#
# Before invoking the tool, do the following IN ORDER:
#
# 1) Start each block of pre-formatted text in your script with
# print << TERMINATOR
# on its own line and end it with
# TERMINATOR
# on its own line. TERMINATOR can be any sequence of non-blank characters
# you like. Spaces are allowed around the symbols but are not required.
# If << is followed by -, e.g.:
# print <<- TERMINATOR
# then all leading tabs are removed from the block of pre-formatted
# text (just like shell here documents), if it's followed by + instead, e.g.:
# print <<+ TERMINATOR
# then however many leading tabs are common across all non-blank lines
# in the current pre-formatted block are removed. By default no leading
# tabs are removed. Anything you place after the TERMINATOR will be
# reproduced as-is after every line in the post-processed script, so
# this for example:
# print << HERE |"cat>&2"
# foo
# HERE
# would cause "foo" to be printed to stderr.
#
# 2) Within each block of pre-formatted text only:
# a) Put a backslash character before every backslash (\ -> \\).
# b) Put a backslash character before every double quote (" -> \").
# c) Enclose awk variables in double quotes without leading
# backslashes (awkVar -> "awkVar").
# d) Enclose awk record and field references ($0, $1, $2, etc.)
# in double quotes without leading backslashes ($1 -> "$1").
#
# 3) If the script is specified on the command line instead of via
# "-f script" then replace all single quote characters (') in or out
# of the pre-formatted blocks with their ANSI octal escape sequence (\047)
# or the sequence '\'' (tick backslash tick tick). This is normal and is
# required because command-line awk scripts cannot contain single quote
# characters as those delimit the script. Do not use hex \x27, see
# http://awk.freeshell.org/PrintASingleQuote.
#
# Then just use it like you would gawk with the small caveat that only
# "-W <option>", not "--<option>", is supported for long options so you
# can use "-W re-interval" but not "--re-interval" for example.
#
# To just see the post-processed script and not execute it, call this
# script with the "-X" option.
#
# See the bottom of this file for usage examples.
##########################################################
# Name this tool was invoked as, used to prefix diagnostics. Parameter
# expansion is used instead of basename(1) so that no process is forked
# and a $0 that begins with "-" (e.g. "-bash") is not parsed as an option.
toolName="${0##*/}"
#######################################
# Pre-process awk source, expanding every block of the form
#     print <<[-|+] TERMINATOR [postprint]
#     ...lines...
#     TERMINATOR
# into one ordinary awk print statement per block line. All other
# lines are copied through unchanged.
#   <<   keeps the block lines as they are
#   <<-  removes all leading tabs from every block line
#   <<+  removes the leading tabs common to all non-blank block lines
# Arguments: $@ - awk script files to expand (stdin is read when none given)
# Outputs:   the expanded awk script on stdout; diagnostics on stderr
# Returns:   the exit status of gawk; 1 when a block is never terminated
#######################################
expand_prints() {
    gawk '
    # Block header: strip "print <<", pick up the optional "-" or "+"
    # modifier, then split the rest into TERMINATOR and the trailing
    # text (postprint) that is appended to every generated print.
    !inBlock && sub(/^[[:blank:]]*print[[:blank:]]*<</,"") {
        if      ( sub(/^[-]/,"") ) { skipType = "-" }
        else if ( sub(/^[+]/,"") ) { skipType = "+" }
        else                       { skipType = ""  }
        gsub(/(^[[:blank:]]+|[[:blank:]]+$)/,"")
        if (/[[:blank:]]/) {
            terminator = $0
            sub(/[[:blank:]].*/,"",terminator)
            postprint = $0
            sub(/[^[:blank:]]+[[:blank:]]+/,"",postprint)
        }
        else {
            terminator = $0
            postprint = ""
        }
        startBlock()
        next
    }

    # Inside a block: a line that equals TERMINATOR once surrounding
    # blanks are ignored ends the block, anything else is collected.
    inBlock {
        stripped = $0
        gsub(/(^[[:blank:]]+|[[:blank:]]+$)/,"",stripped)
        # Appending "" forces a string (not numeric) comparison.
        if ( stripped"" == terminator"" ) {
            endBlock()
        }
        else {
            updBlock()
        }
        next
    }

    # Everything outside a block is passed through untouched.
    { print }

    # A block that is still open at end of input would otherwise be
    # dropped without any output, so report it instead.
    END {
        if (inBlock) {
            printf "expand_prints: ERROR: print block terminator \"%s\" not found.\n", terminator > "/dev/stderr"
            exit 1
        }
    }

    function startBlock() { inBlock=1; numLines=0 }

    function updBlock() { block[++numLines] = $0 }

    # Emit the collected block as print statements.
    # i, numSkip and indent are local variables.
    function endBlock(    i,numSkip,indent) {
        # indent is a dynamic regexp anchored at the start of the line
        # so that only LEADING tabs are ever removed; it stays empty
        # when no tabs are to be removed.
        indent = ""
        if (skipType == "-") {
            # skip all leading tabs
            indent = "^[\t]+"
        }
        else if (skipType == "+") {
            # skip however many leading tabs are common across
            # all non-blank lines in the current pre-formatted block
            for (i=1;i<=numLines;i++) {
                if (block[i] ~ /[^[:blank:]]/) {
                    # RLENGTH is -1 when the line has no leading tab
                    match(block[i],/^[\t]+/)
                    if ( (numSkip == "") || (numSkip > RLENGTH) ) {
                        numSkip = RLENGTH
                    }
                }
            }
            if (numSkip > 0) {
                indent = "^"
                for (i=1;i<=numSkip;i++) {
                    indent = indent "\t"
                }
            }
        }
        for (i=1;i<=numLines;i++) {
            if (indent != "") {
                sub(indent,"",block[i])
            }
            print "print \"" block[i] "\"\t" postprint
        }
        inBlock=0
    }
    ' "$@"
}
# Command-line driver: collect the gawk options, locate the awk program
# (from "-f file" options or, failing that, the first non-option
# argument), expand its print blocks and then either show the expanded
# program (-X) or run it with gawk.
awkArgs=()
scriptFiles=()
# Initialised explicitly so a value inherited from the environment is
# never mistaken for a program given on the command line.
scriptText=""
expandOnly=0
while getopts "v:F:W:f:X" arg
do
    case "$arg" in
    f ) scriptFiles+=( "$OPTARG" ) ;;
    [vFW] ) awkArgs+=( "-$arg" "$OPTARG" ) ;;
    X ) expandOnly=1 ;;
    * ) exit 1 ;;
    esac
done
shift $(( OPTIND - 1 ))
if [[ -z "${scriptFiles[*]}" ]] && (( $# > 0 ))
then
    # The script cannot contain literal 's because in cases like this:
    # 'BEGIN{ ...abc'def... }'
    # the args parsed here (and later again by gawk) would be:
    # $1 = BEGIN{ ...abc
    # $2 = def... }
    # Replace 's with \047 or '\'' if you need them:
    # 'BEGIN{ ...abc\047def... }'
    # 'BEGIN{ ...abc'\''def... }'
    scriptText="$1"
    shift
fi
# Remaining symbols in "$@" must be data file names and/or variable
# assignments that do not use the "-v name=value" syntax.
if [[ -n "${scriptFiles[*]}" ]]
then
    if (( expandOnly == 1 ))
    then
        expand_prints "${scriptFiles[@]}"
    else
        # Expand first and stop on failure (e.g. unreadable script file)
        # rather than running gawk with an empty or partial program.
        expandedScript="$(expand_prints "${scriptFiles[@]}")" || exit
        # "--" ends gawk option parsing so that program text starting
        # with "-" is not taken for an option.
        gawk "${awkArgs[@]}" -- "$expandedScript" "$@"
    fi
elif [[ -n "$scriptText" ]]
then
    if (( expandOnly == 1 ))
    then
        printf '%s\n' "$scriptText" | expand_prints
    else
        expandedScript="$(printf '%s\n' "$scriptText" | expand_prints)" || exit
        gawk "${awkArgs[@]}" -- "$expandedScript" "$@"
    fi
else
    printf '%s: ERROR: no awk script specified.\n' "$toolName" >&2
    exit 1
fi
# Exit with the status of the last command run above.
exit
##########################################################
USAGE EXAMPLES:
$ cat data.txt
abc def"ghi
$
#######
$ cat script.awk
{
awkVar="bar"
print "----------------"
print << HERE
backslash: \\
quoted text: \"text\"
single quote as ANSI sequence: \047
literal single quote (ONLY works when script is in a file): '
awk variable: "awkVar"
awk field: "$2"
HERE
print "----------------"
print <<-!
backslash: \\
quoted text: \"text\"
single quote as ANSI sequence: \047
literal single quote (ONLY works when script is in a file): '
awk variable: "awkVar"
awk field: "$2"
!
print "----------------"
print <<+ whatever
backslash: \\
quoted text: \"text\"
single quote as ANSI sequence: \047
literal single quote (ONLY works when script is in a file): '
awk variable: "awkVar"
awk field: "$2"
whatever
print "----------------"
}
$ epawk -f script.awk data.txt
----------------
backslash: \
quoted text: "text"
single quote as ANSI sequence: '
literal single quote (ONLY works when script is in a file): '
awk variable: bar
awk field: def"ghi
----------------
backslash: \
quoted text: "text"
single quote as ANSI sequence: '
literal single quote (ONLY works when script is in a file): '
awk variable: bar
awk field: def"ghi
----------------
backslash: \
quoted text: "text"
single quote as ANSI sequence: '
literal single quote (ONLY works when script is in a file): '
awk variable: bar
awk field: def"ghi
----------------
#######
$ epawk -F\" '{
print <<!
ANSI-tick-surrounded quote-separated field 2 (will work): \047"$2"\047
!
}' data.txt
ANSI-tick-surrounded quote-separated field 2 (will work): 'ghi'
$
#######
$ epawk -F\" '{
print <<!
Shell-escaped-tick-surrounded quote-separated field 2 (will work): '\''"$2"'\''
!
}' data.txt
Shell-escaped-tick-surrounded quote-separated field 2 (will work): 'ghi'
$
#######
$ epawk -F\" '{
print <<!
Literal-tick-surrounded quote-separated field 2 (will not work): '"$2"'
!
}' data.txt
Literal-tick-surrounded quote-separated field 2 (will not work):
$
#######
$ epawk -X 'BEGIN{
print <<!
foo
bar
!
}'
BEGIN{
print " foo"
print " bar"
}
$
#######
$ cat file
a
b
c
$ epawk '{
print <<+! |"cat>o2"
numLines="NR"
numFields="NF", $0="$0", $1="$1"
!
}' file
$ cat o2
numLines=1
numFields=1, $0=a, $1=a
numLines=2
numFields=1, $0=b, $1=b
numLines=3
numFields=1, $0=c, $1=c
$
#######
$ epawk 'BEGIN{
cmd = "sort"
print <<+! |& cmd
d
b
a
c
!
close(cmd, "to")
while ( (cmd |& getline line) > 0 ) {
print "got:", line
}
close(cmd)
}' file
got: a
got: b
got: c
got: d
$
.
- Prev by Date: Re: select(2) gawk extension
- Next by Date: Beta release of gawk 4.0.1
- Previous by thread: select(2) gawk extension
- Next by thread: Beta release of gawk 4.0.1
- Index(es):
Relevant Pages
|