#!/usr/bin/perl -w
my $VERSION = "0.13";
# vim: set sw=4 ts=4 si et:
# Copyright: GPL
# Author: Guido Socher
#
# tr_tagcontentgrep -- grep for a pattern inside xml/sgml/html tags.
#
# The input is read "by tag": the input record separator is set to '<', so
# every record after the first one in a file starts with the content of a tag
# and runs up to (and including) the '<' that opens the next tag.
#
use strict;
use Getopt::Std;

# Option flags filled in by getopts(); they must be package variables.
our ($opt_h, $opt_l);

# Forward declaration so the prototyped help() can be called before its body.
sub help();

#------------------
getopts("lh") || die "ERROR: No such option. -h for help.\n";
help() if ($opt_h);
# Test for presence rather than truth so that the pattern "0" is accepted.
help() unless (defined $ARGV[0] && length $ARGV[0]);

my $pattern = shift;
# Compile the user supplied pattern once; the search is not case sensitive.
my $tag_re = qr/$pattern/i;

# Record separator, normally "\n" but in html "\n" means nothing.
$/ = '<';

my $line_no          = 1;    # line of the '<' that ends the record just read
my $before_first_tag = 1;    # true until the first record of a file is consumed

while (my $record = <>) {
    # The '<' that terminated the previous record opened this tag, so the
    # tag starts on the line we had reached before reading this record.
    my $tag_line = $line_no;

    # The first record of every file is the text in front of the first '<';
    # it is not a tag and must only contribute to the line count.
    my $is_tag = !$before_first_tag;
    $before_first_tag = 0;

    # Keep track of line numbers, then flatten the record onto one line.
    $line_no += ($record =~ tr/\n//);
    $record =~ s/\r?\n/ /g;

    next unless $is_tag;

    # Take away the end of the tag and everything behind it.
    $record =~ s/>.*//;
    # Kill multiple space.
    $record =~ s/[ \t]+/ /g;

    next unless $record =~ $tag_re;

    # Matches this tag. Print filename:linenumber: matched tag
    print "${ARGV}:${tag_line}: " if ($opt_l);
    print "<$record>\n";
}
continue {
    # eof without parentheses is true at the end of each input file, so the
    # per-file state is restarted before the next file is read.
    if (eof) {
        $line_no          = 1;
        $before_first_tag = 1;
    }
}

#
# help() -- print the usage text, including the version, to STDOUT and
# terminate the program with exit status 0. Never returns.
#
sub help() {
    print <<"END_HELP";
tr_tagcontentgrep -- grep for a xml/sgml/html tag

USAGE: tr_tagcontentgrep [-hl] regexp-pattern [file ...]

tr_tagcontentgrep opens all files provided on the command line and
searches for the given pattern in the tags. The search is
not case sensitive.

All space in the tags is reduced to max. one space. You can search
for "a href" even if the original tag had multiple spaces
between "a" and "href".

OPTIONS: -h this help

         -l list filename and line number

EXAMPLE:
    tr_tagcontentgrep -l img file.html
would e.g print something like:
index.html:53: <IMG src="../images/transpix.gif" alt="">
index.html:257: <IMG SRC="../Logo.gif" width=128 height=53>

tr_tagcontentgrep is part of the HTML::TagReader package but is an example
that you can also do 'reading by tag' without HTML::TagReader.
tr_tagcontentgrep uses plain perl and sets the \$/ variable. Working without
HTML::TagReader causes however problems when working with faulty html code
where single '<'-characters appear somewhere in the text.

version $VERSION
END_HELP
    exit(0);
}
__END__

=head1 NAME

tr_tagcontentgrep -- grep for a xml/sgml/html tag

=head1 SYNOPSIS

    tr_tagcontentgrep [-hl] regexp-pattern [file ...]

=head1 DESCRIPTION

tr_tagcontentgrep opens all files provided on the command line and
searches for the given pattern in the tags. The search is
not case sensitive.

tr_tagcontentgrep is part of the HTML::TagReader package but is an example
that you can also do 'reading by tag' without HTML::TagReader.
tr_tagcontentgrep uses plain perl and sets the $/ variable. Working without
HTML::TagReader causes however problems when working with faulty html code
where single '<'-characters appear somewhere in the text.

All space in the tags is reduced to max. one space. You can search
for "a href" even if the original tag had multiple spaces
between "a" and "href".

=head1 OPTIONS

B<-h> this help

B<-l> list filename and line number

=head1 EXAMPLE

    tr_tagcontentgrep -l img file.html

would e.g print something like:

    index.html:53: <IMG src="../images/transpix.gif" alt="">
    index.html:257: <IMG SRC="../Logo.gif" width=128 height=53>

=head1 AUTHOR

tr_tagcontentgrep is part of the HTML::TagReader package and was written by
Guido Socher [guido(at)linuxfocus.org]

=cut