#!/usr/bin/perl  -I/home/phil/perl/cpan/DataEditXml/lib/ -I/home/phil/perl/cpan/DataTableText/lib/
#-------------------------------------------------------------------------------
# Cross reference Dita XML, match topics and ameliorate missing references.
# Philip R Brenan at gmail dot com, Appa Apps Ltd Inc, 2016-2018
#-------------------------------------------------------------------------------
# podDocumentation
# It is easier to criticize than to fix - but we do some fixes anyway.
# Check for image formats that will not display in a browser
# Needs more tests
# Do not consider companion files!
# Images that are referenced by topics which are not referenced by bookmaps show up as referenced
# CONREF processing in reportReferencesFromBookmaps
# Fix xref external/scope and eliminate error count if fixbadrefs in operation.

package Data::Edit::Xml::Xref;
our $VERSION = 20190524;
use v5.20;
use warnings FATAL => qw(all);
use strict;
use Carp qw(confess cluck);
use Data::Dump qw(dump);
use Data::Edit::Xml;
use Data::Table::Text qw(:all);
use utf8;

sub maximumFileNameChars                                                        # Maximum number of characters to use in constructing a file name component
 {return 128;
 }

#D1 Cross reference                                                             # Check the cross references in a set of Dita files and report the results.

sub improvementLength                                                           #P Improvement length: the maximum number of characters of text quoted in a suggested improvement
 {return 80;
 }

sub xref(%)                                                                     # Check the cross references in a set of Dita files held in  L<inputFolder|/inputFolder> and report the results in the L<reports|/reports> folder. The possible attributes are defined in L<Data::Edit::Xml::Xref|/Data::Edit::Xml::Xref>
 {my (%attributes) = @_;                                                        # Attributes
  my $xref = genHash(__PACKAGE__,                                               # Attributes used by the Xref cross referencer.
    addNavTitles=>undef,                                                        # If true, add navtitle to topicrefs to show the title of the target
    attributeCount=>{},                                                         # {file}{attribute name} == count of the different xml attributes found in the xml files.
    attributeNamesAndValuesCount=>{},                                           # {file}{attribute name}{value} = count
    author=>{},                                                                 # {file} = author of this file.
    badBookMaps=>{},                                                            # Bad book maps.
    badConRefs=>{},                                                             # {sourceFile} = [file, href] indicating the file has at least one bad conref.
    badConRefsList=>{},                                                         # Bad conrefs - by file.
    badGuidHrefs=>{},                                                           # Invalid guid hrefs - counted as "invalid guid hrefs" in the status line.
    badImageRefs=>{},                                                           # Consolidated images missing.
    badNavTitles=>{},                                                           # Details of nav titles that were not resolved
    badTables=>[],                                                              # Array of tables that need fixing.
    badTopicRefs=>{},                                                           # [file, href]   Invalid href attributes found on topicref tags.
    badXml1=>{},                                                                # [Files] with a bad xml encoding header on the first line.
    badXml2=>{},                                                                # [Files] with a bad xml doc type on the second line.
    badXRefs=>{},                                                               # Bad Xrefs - by file
    badXRefsList=>{},                                                           # Bad Xrefs - all
    baseTag=>{},                                                                # Base Tag for each file
    conRefs=>{},                                                                # {file}{href}   Count of conref definitions in each file.
    deguidize=>undef,                                                           #I Set true to replace guids in dita references with file name. Given reference g1#g2/id convert g1 to a file name by locating the topic with topicId g2.  This requires the guids to be genuinely unique. SDL guids are thought to be unique by language code but the same topic, translated to a different language might well have the same guid as the original topic with a different language code: =(de|en|es|fr).  If the source is in just one language then the guid uniqueness is a reasonable assumption.  If the conversion can be done in phases by language then the uniqueness of guids is again reasonably assured. L<Data::Edit::Xml::Lint> provides an alternative solution to deguidizing by using labels to record the dita reference in the input corpus for each id encountered, these references can then be resolved in the usual manner by L<Data::Edit::Xml::Lint::relint>.
    debugTimes=>undef,                                                          #I Write timing information if true
    docType=>{},                                                                # {file} == docType:  the docType for each xml file.
    duplicateIds=>{},                                                           # [file, id]     Duplicate id definitions within each file.
    duplicateTopicIds=>{},                                                      # [topicId, [files]] Files with duplicate topic ids - the id on the outermost tag.
    fileExtensions=>[qw(.dita .ditamap .xml .fodt)],                            # Default file extensions to load
    fixBadRefs=>undef,                                                          #I Try to fix bad references in L<these files|/fixRefs> where possible by either changing a guid to a file name assuming the right file is present in the corpus being scanned and L<deguidize|/deguidize> has been set true or failing that by moving the failing reference to the "xtrf" attribute.
    fixRefs=>{},                                                                # {file}{ref} where the href or conref target is not valid.
    fixedRefs=>[],                                                              # [] hrefs and conrefs from L<fixRefs|/fixRefs> which were invalid but have been fixed by L<deguidizing|/deguidize> them to a valid file name.
    fixedFolder=>undef,                                                         # Fixed files are placed in this folder if L<fixBadRefs|/fixBadRefs> has been specified.
    fixedRefsFailed=>[],                                                        # [] hrefs and conrefs from L<fixRefs|/fixRefs> which were moved to the "xtrf" attribute as requested by the L<fixBadRefs|/fixBadRefs> attribute because the reference was invalid and could not be improved by L<deguidization|/deguidize>.
    fixedRefsNoAction=>[],                                                      # [] hrefs and conrefs from L<fixRefs|/fixRefs> for which no action was taken.
    fixedRefsGB=>[],                                                            # [] files fixed to the Gearhart-Brenan file naming standard
    fixRelocatedRefs=>undef,                                                    #I Fix references to topics that have been moved around in the out folder structure assuming that all file names are unique.
    fixXrefsByTitle=>undef,                                                     #I Try to fix invalid xrefs by the Gearhart Title Method if true
    flattenFolder=>undef,                                                       #I Files are renamed to the Gearhart standard and placed in this folder if set.  References to the unflattened files are updated to references to the flattened files.  This option will eventually be deprecated as the Dita::GB::Standard is now fully available allowing files to be easily flattened before being processed by Xref.
    flattenFiles=>{},                                                           # {old full file name} = file renamed to Gearhart-Brenan file naming standard
    goodBookMaps=>{},                                                           # Good book maps.
    goodConRefs=>{},                                                            # Good con refs - by file.
    goodConRefsList=>{},                                                        # Good con refs - all.
    goodGuidHrefs=>{},                                                          # {file}{href}{location}++ where a href that starts with GUID- has been correctly resolved.
    goodImageRefs=>{},                                                          # Consolidated images found.
    goodNavTitles=>{},                                                          # Details of nav titles that were resolved
    goodTopicRefs=>{},                                                          # Good topic refs.
    goodXRefs=>{},                                                              # Good xrefs - by file.
    goodXRefsList=>{},                                                          # Good xrefs - all.
    guidHrefs=>{},                                                              # {file}{href} = location where href starts with GUID- and is thus probably a guid.
    guidToFile=>{},                                                             # {topic id which is a guid} = file defining topic id.
    hrefUrlEncoding=>{},                                                        # Hrefs that need url encoding because they contain white space
    ids=>{},                                                                    # {file}{id}     Id definitions across all files.
    images=>{},                                                                 # {file}{href}   Count of image references in each file.
    imagesReferencedFromBookMaps=>{},                                           # {bookmap full file name}{full name of image referenced from topic referenced from bookmap}++
    imagesReferencedFromTopics=>{},                                             # {topic full file name}{full name of image referenced from topic}++
    improvements=>{},                                                           # Suggested improvements - a list of improvements that might be made.
    inputFiles=>[],                                                             # Input files from L<inputFolder|/inputFolder>.
    inputFolder=>undef,                                                         #I A folder containing the dita and ditamap files to be cross referenced.
    inputFolderImages=>{},                                                      # {full image file name} for all files in input folder thus including any images present
    ltgt=>{},                                                                   # {text between &lt; and &gt}{filename} = count giving the count of text items found between &lt; and &gt;
    maximumNumberOfProcesses=>4,                                                #I Maximum number of processes to run in parallel at any one time.
    maxZoomIn=>undef,                                                           #I Optional hash of names to regular expressions to look for in each file
    maxZoomOut=>{},                                                             # Results from L<maxZoomIn|/maxZoomIn>  where {file name}{regular expression key name in L<maxZoomIn|/maxZoomIn>}++
    md5Sum=>{},                                                                 # MD5 sum for each input file.
    missingImageFiles=>{},                                                      # [file, href] == Missing images in each file.
    missingTopicIds=>{},                                                        # Missing topic ids.
    noHref=>{},                                                                 # Tags that should have an href but do not have one.
    notReferenced=>{},                                                          # {file name} Files in input area that are not referenced by a conref, image, topicref or xref tag and are not a bookmap.
    olBody=>{},                                                                 # The number of ol under body by file
    parseFailed=>{},                                                            # {file} files that failed to parse.
    relocatedReferencesFailed=>[],                                              # Failing references that were not fixed by relocation
    relocatedReferencesFixed=>[],                                               # Relocated references fixed
    reports=>q(reports),                                                        #I Reports folder: the cross referencer will write reports to files in this folder.
    results=>[],                                                                # Summary of results table.
    sourceFile=>undef,                                                          # The source file from which this structure was generated.
    statusLine=>undef,                                                          # Status line summarizing the cross reference.
    statusTable=>undef,                                                         # Status table summarizing the cross reference.
    summary=>1,                                                                 #I Print the summary line.
    tagCount=>{},                                                               # {file}{tags} == count of the different tag names found in the xml files.
    title=>{},                                                                  # {file} = title of file.
    titleToFile=>{},                                                            # {title}{file}++ if L<fixXrefsByTitle> is in effect
    topicIds=>{},                                                               # {file} = topic id - the id on the outermost tag.
    topicRefs=>{},                                                              # {bookmap full file name}{href}{navTitle}++ References from bookmaps to topics via appendix, chapter, topicref.
    topicsReferencedFromBookMaps=>{},                                           # {bookmap file, file name}{topic full file name}++
    validationErrors=>{},                                                       # True means that Lint detected errors in the xml contained in the file.
    vocabulary=>{},                                                             # The text of each topic shorn of attributes for vocabulary comparison.
    xRefs=>{},                                                                  # {file}{href}++ Xrefs references.
    xrefBadScope=>{},                                                           # External xrefs with no scope=external.
    xrefBadFormat=>{},                                                          # External xrefs with no format=html.
    relativePath=>undef,                                                        #I Report files relative to this path or absolutely if undefined.
    matchTopics=>undef,                                                         #I Match topics by title and by vocabulary to the specified confidence level between 0 and 1.  This operation might take some time to complete on a large corpus.
   );

  loadHash($xref, @_);                                                          # Load attributes complaining about any invalid ones

  $xref->inputFolder or confess "Please supply a value for: inputFolder";
  $xref->inputFolder =~ s(\/+\Z) (\/)gs;                                        # Cleanup path names
  $xref->inputFolder = absFromAbsPlusRel(currentDirectory, $xref->inputFolder)  # Make input folder absolute
    if $xref->inputFolder !~ m(\A/);

  if (my $r = $xref->relativePath)                                              # Make relativePath absolute
   {$r =~ s(/*\Z) (/)s;                                                         # Exactly one trailing slash
    if ($r !~ m(\A/)s)                                                          # Only a relative path needs anchoring at the current directory - a path that is already absolute is left as it is
     {$r = absFromAbsPlusRel(currentDirectory, $r);
     }
    $xref->relativePath = $r;
   }

  my @phases = (q(loadInputFiles),                                              # All non topic matching reports - each phase is the name of a method called on the cross referencer in the order shown
                q(analyze),
                q(reportXml1),
                q(reportXml2),
                q(reportDuplicateIds),
                q(reportDuplicateTopicIds),
                q(reportNoHrefs),
                q(reportXrefs),
                q(reportTopicRefs),
                q(reportTables),
                q(reportConrefs),
                q(reportImages),
                q(reportParseFailed),
                q(reportAttributeCount),
                q(reportAttributeNamesAndValuesCount),
                q(reportLtGt),
                q(reportTagCount),
                q(reportDocTypeCount),
                q(reportFileExtensionCount),
                q(reportFileTypes),
                q(reportValidationErrors),
                q(reportBookMaps),
                q(reportGuidHrefs),
                q(reportGuidsToFiles),
                q(reportExternalXrefs),
                q(reportPossibleImprovements),
                q(reportMaxZoomOut),
                q(reportTopicDetails),
                q(reportTopicReuse),
                q(reportMd5Sum),
                q(reportOlBody),
                q(reportHrefUrlEncoding),
                q(reportFixRefs),
                q(reportReferencesFromBookMaps),
               );

  if ($xref->addNavTitles)                                                      # Add nav titles to bookmaps if requested
   {push @phases, q(addNavTitlesToMaps);
   }

  if ($xref->fixBadRefs or $xref->deguidize or $xref->fixRelocatedRefs or       # Fix files if requested - done here so that files can be written back for flattening if requested,
      $xref->fixXrefsByTitle)                                                   # Attempt to fix xrefs using the Gearhart Title Method.
   {push @phases, q(fixFiles);
   }

  if ($xref->flattenFolder)                                                     # Fix file names to the Gearhart-Brenan file naming standard
   {push @phases, q(fixFilesGB)
   }

  if ($xref->matchTopics)                                                       # Topic matching reports
   {push @phases, q(reportSimilarTopicsByTitle),
                  q(reportSimilarTopicsByVocabulary);
   }

  push @phases, q(reportNotReferenced);                                         # Need to account for changes made by fixFiles or FixFilesGB

  for my $phase(@phases)                                                        # Perform phases
   {lll "Phase: $phase" if $xref->debugTimes;
    $xref->$phase;
   }
  lll "Phase: end" if $xref->debugTimes;

  formattedTablesReport                                                         # Index of the reports produced, written to the reports folder
   (title=>q(Reports available),
    head=><<END,
NNNN reports available on DDDD

Sorted by title
END
   file=>fpe($xref->reports, qw(reports txt)));

  if (1)                                                                        # Summarize
   {my @o;                                                                      # [count, description] for each condition found at least once
    my $save = sub                                                              # Count the entries in a result field and record the count with a plural or singular description if the count is not zero
     {my ($levels, $field, $plural, $single) = @_;                              # Levels to count down to, result field name, plural description, optional singular description defaulting to the plural with one trailing "s" removed
      my $n = &countLevels($levels, $xref->{$field});
      push @o, [$n,            $plural]                   if $n >  1;
      push @o, [$n, $single // ($plural =~ s(s\Z) ()gsr)] if $n == 1;
     };

    $save->(1, "badBookMaps",       q(bad book maps));                          # Status line components
    $save->(1, "badConRefs",        q(files with bad conrefs), q(file with bad conrefs));
    $save->(1, "badConRefsList",    q(bad conrefs));
    $save->(1, "badGuidHrefs",      q(invalid guid hrefs));
    $save->(1, "badImageRefs",      q(missing image files));
    $save->(1, "badTopicRefs",      q(bad topicrefs));
    $save->(1, "badTables",         q(bad tables));
    $save->(1, "badXml1",           q(bad first lines));
    $save->(1, "badXml2",           q(bad second lines));
    $save->(1, "badXRefs",          q(files with bad xrefs), q(file with bad xrefs));
    $save->(1, "badXRefsList",      q(bad xrefs));
    $save->(1, "duplicateIds",      q(duplicate ids));
    $save->(2, "duplicateTopicIds", q(duplicate topic ids));
    $save->(1, "fixedRefsFailed",   q(xtfr));                                    # Unable to resolve these references but was allowed to ameliorate them by moving them to the xtrf attribute - the status label keeps its existing spelling
#   $save->(2, "improvements",      q(improvements));
    $save->(1, "missingImageFiles", q(missing image references));
    $save->(1, "missingTopicIds",   q(missing topic ids));
    $save->(1, "hrefUrlEncoding",   q(href url encoding), q(href url encoding));
    $save->(2, "noHref",            q(hrefs missing), q(href missing));
    $save->(1, "notReferenced",     q(files not referenced), q(file not referenced));
    $save->(1, "parseFailed",       q(files failed to parse), q(file failed to parse));
    $save->(2, "validationErrors",  q(validation errors)); # Needs testing
    $save->(2, "xrefBadFormat",     q(External xrefs with no format=html));
    $save->(2, "xrefBadScope",      q(External xrefs with no scope=external));

    $xref->statusLine = @o ? join " ",                                          # Status line: largest counts first, equal counts in alphabetical order of description, empty if nothing was found
      "Xref:", join ", ",
               map {join " ", @$_}
               sort
                {return $$a[1] cmp $$b[1] if $$b[0] == $$a[0];
                 $$b[0] <=> $$a[0]
                }
               @o : q();

    $xref->statusTable = formatTable
     ([sort {$$b[0] <=> $$a[0]} @o], [qw(Count Condition)]);                    # Summary in status form
    $xref->results = \@o;                                                       # Save status line components

    if (@o and $xref->summary)                                                  # Summary line
     {say STDERR $xref->statusLine;
     }
   }

  $xref                                                                         # Return Xref results
 }

sub countLevels($$)                                                             #P Count hash or array elements down to the specified number of levels
 {my ($depth, $data) = @_;                                                      # Levels, hash or array reference
  my $type    = ref($data);                                                     # Empty string for anything that is not a reference
  my $isArray = $type =~ m(array)i;
  my $isHash  = $type =~ m(hash)i;

  if ($depth <= 1)                                                              # Last level: the answer is the number of elements at this level
   {return scalar @$data      if $isArray;
    return scalar keys %$data if $isHash;
    return 0;                                                                   # Not a hash or array so there is nothing to count
   }

  my @children = $isHash  ? values %$data :                                     # Elements to descend into
                 $isArray ? @$data        : ();
  my $total = 0;
  for my $child(@children)                                                      # Sum the counts from one level down
   {$total += &countLevels($depth - 1, $child);                                 # The & form avoids a prototype check on this sub while it is still being compiled
   }
  $total
 }

sub loadInputFiles($)                                                           #P Load the names of the files to be processed
 {my ($xref) = @_;                                                              # Cross referencer
  my @in = searchDirectoryTreesForMatchingFiles($xref->inputFolder,             # Files in the input folder with one of the requested extensions
                                                @{$xref->fileExtensions});
  $xref->inputFiles = [@in];                                                    # Save the file names as an array reference while keeping the list so that the number of files found can be tested

  if (@in == 0)                                                                 # Complain if there are no input files to analyze
   {my $i = $xref->inputFolder;
    my $e = join " ", @{$xref->fileExtensions};
    my $x = -d $i ? "The input folder does exist." :
                    "The input folder does NOT exist!";
    lll "No files with the specified file extensions",
        " in the specified input folder:\n",
        "$e\n$i\n$x";
   }

  my @images = searchDirectoryTreesForMatchingFiles($xref->inputFolder);        # Every file in the input folder regardless of extension so that any images are included
  $xref->inputFolderImages = {map {fn($_), $_} @images};                        # Image file name which works well for images because the md5 sum in their name is probably unique
 }

sub analyzeOneFile($$)                                                          #P Analyze one input file
 {my ($Xref, $iFile) = @_;                                                      # Xref request, File to analyze
  my $xref = bless {};                                                          # Cross referencer for this file
     $xref->sourceFile = $iFile;                                                # File analyzed
  my @improvements;                                                             # Possible improvements
  my %maxZoomIn = $Xref->maxZoomIn ?  %{$Xref->maxZoomIn} : ();                 # Regular expressions from maxZoomIn to look for text
  my %maxZoomOut;                                                               # Text elements that match a maxZoomIn regular expression
  my $changes;                                                                  # Changes made to the file

  $xref->md5Sum->{$iFile} = fileMd5Sum($iFile);                                 # Md5 sum for input file

# my $x = eval {Data::Edit::Xml::new($iFile, lineNumbers=>1)};                  # Parse xml
  my $x = eval {Data::Edit::Xml::new($iFile)};                                  # Parse xml - at this point if the caller is interested in line numbers they should have added them.

  if ($@)
   {$xref->parseFailed->{$iFile}++;
    return $xref;
   }

  $x->by(sub                                                                    # Each node
   {my ($o) = @_;

    my $content = sub                                                           #P First few characters of content on one line to avoid triggering multi table layouts
     {my ($o) = @_;                                                             # String
      nws($o->stringContent, improvementLength);
     };

    my $loc = sub                                                               #P Location
     {my ($o) = @_;                                                             # String
      ($o->lineLocation, $iFile)
     };

    my $tag = -t $o;

    if (my $i = $o->id)                                                         # Id definitions
     {$xref->ids->{$iFile}{$i}++;
     }

    if ($tag eq q(xref))                                                        # Xrefs but not to the web
     {if (my $h = $o->href)
       {if ($h =~ m(\A(https?://|mailto:|www.))i)                               # Check attributes on external links
         {if ($o->attrX_scope !~ m(\Aexternal\Z)s)
           {$xref->xrefBadScope->{$iFile}{$h} = -A $o;
           }
          if ($o->attrX_format !~ m(\Ahtml\Z)s)
           {$xref->xrefBadFormat->{$iFile}{$h} = -A $o;
           }
         }
        elsif ($h =~ m(\Aguid-)is)                                              # Href is a guid
         {$xref->guidHrefs->{$iFile}{$h} = [$tag, $o->lineLocation];
         }
        else #if ($o->attrX_format =~ m(\Adita)i)                               # Check xref has format=dita AW83 at 2018.12.13 01:10:33
         {$xref->xRefs->{$iFile}{$h}{$o->stringText}++;
         }
       }
      else
       {push @{$xref->noHref->{$iFile}}, [$tag, $o->lineLocation, $iFile];      # No href
       }
     }
    elsif ($tag =~ m(\A(appendix|chapter|link|mapref|notices|topicref)\Z)is)    # References from bookmaps
     {if (my $h = $o->href)
       {if ($h !~ m(uacommon/collections/r_ng_legal_notices)s)                  ####TEST####
         {if ($h =~ m(\Aguid-)is)                                               # Href is a guid
           {$xref->guidHrefs->{$iFile}{$h} = [$tag, $o->lineLocation];
           }
          else
           {$xref->topicRefs->{$iFile}{$h}{$o->attr_navtitle//$o->stringText}++;
           }
         }
       }
      else
       {push @{$xref->noHref->{$iFile}}, [$tag, $o->lineLocation, $iFile];      # No href
       }
     }
    elsif ($tag eq q(image))                                                    # Images
     {if (my $h = $o->href)
       {if ($h =~ m(\Aguid-)is)                                                 # Href is a guid
         {$xref->guidHrefs->{$iFile}{$h} = [$tag, $o->lineLocation];            # Resolve image later
         }
        else
         {$xref->images->{$iFile}{$h}++;
         }
        $xref->imagesReferencedFromTopics->{$iFile}{$h}++;                      # Image referenced from a bookmap
       }
      else
       {push @{$xref->noHref->{$iFile}}, [$tag, $o->lineLocation, $iFile];      # No href
       }
     }

    if (my $conref = $o->attr_conref)                                           # Conref
     {$xref->conRefs->{$iFile}{$conref}++;
     }

    if (my $conref = $o->attr_conrefend)                                        # Conref end
     {$xref->conRefs->{$iFile}{$conref}++;
     }

    if ($o->isText_p)                                                           # Notes
     {my $t = nws($o->text, improvementLength);
      if ($t =~ m(\b(Attention|Caution|Danger|Fastpath|Important|Notice|Note|Remember|Restriction|Tip|Trouble|Warning)\b)is)
       {push @improvements, ["Note", $t, &$loc];
       }
     }
    elsif ($tag eq q(required-cleanup))                                         # Required cleanup
     {my $t = &$content;
      push @improvements, [-t $o, $t, &$loc];
     }
    elsif ($tag eq q(steps-unordered))                                          # Steps unordered
     {my $t = nws(-c $o, improvementLength);
      push @improvements, [-t $o, $t, &$loc];
     }
    elsif ($tag eq q(p))                                                        # Paragraphs with lots of bold
     {my $n = my @c = $o->c_b;
      if ($n >= 3)
       {my $t = &$content;
        push @improvements,
         [q(More than 3 bold in p), $t, &$loc];
       }
     }
    elsif ($tag eq q(title) and $o->parent == $x)                               # Title
     {my $t = $o->stringContent;
      $xref->title->{$iFile} = $t;                                              # Topic Id

      if (my $p = $o->parent)
       {if (my ($w) = split /\s+/, $t, 2)
         {my $task = $w =~ m(\AHow|ing\Z)is;                                    # How/ing concept/task

          if ($p->at_concept && $task)
           {push @improvements, [q(Better as task?),    $t, &$loc];
           }
          elsif ($p->at_task && !$task)
           {push @improvements, [q(Better as concept?), $t, &$loc];
           }
         }
       }
     }
    elsif ($o->at_mainbooktitle_booktitle_bookmap)                              # Title for bookmaps
     {my $t = $xref->title->{$iFile} //= $o->stringText;
     }
    elsif ($tag eq q(author))                                                   # Author
     {$xref->author->{$iFile} = my $t = &$content;
     }
    elsif ($tag eq q(ol))                                                       # Ol
     {if (my $p = $o->parent)
       {if ($p->tag =~ m(body\Z)s)
         {$xref->olBody->{$iFile}++;
         }
       }
     }
    elsif ($tag eq q(tgroup))                                                   # Tgroup cols
     {my $error = sub                                                           # Table error message
       {push @{$xref->badTables},
         [join('', @_), $tag, $o->lineLocation, $iFile];
       };

      my $stats     = $o->ditaTGroupStatistics;                                 # Statistics for table
      my $cols      = $stats->colsAttribute;
      my $maxCols   = max($stats->maxHead//0, $stats->maxBody//0);
      my $maxColsMP = max($stats->maxHeadMinusPadding//0,
                          $stats->maxBodyMinusPadding//0);
      if (($stats->maxHead//0) == $maxCols &&                                   # The right combination of body and header
          ($stats->minHead//0) == $maxCols &&
          ($stats->maxBody//0) == $maxCols &&
          ($stats->minBody//0) == $maxCols &&
           $stats->colSpec     == $maxCols
       or !defined($stats->maxHead)        &&                                   # No headers but everything else looks good
          ($stats->maxBody//0) == $maxCols &&
          ($stats->minBody//0) == $maxCols &&
           $stats->colSpec     == $maxCols)
       {if (!$cols)                                                             # Check for cols attribute
         {$error->(qq(No cols attribute, should be $maxCols));
         }
        elsif ($cols ne $maxCols)                                               # Cols present but wrong
         {$error->(qq(Cols attribute is $cols but should be $maxCols));
         }
       }
      elsif (($stats->maxHead//0) < $maxColsMP)                                 # Not enough headers
       {$error->(qq(Not enough headers));
       }
      else
       {$error->(qq(Column padding required));
       }
     }
    elsif (keys %maxZoomIn and $o->isText)                                      # Search for text using Micaela's Max Zoom In Method
     {my $t = $o->text;
      for my $name(sort keys %maxZoomIn)                                        # Each regular expression to check
       {my $re = $maxZoomIn{$name};
        if ($t =~ m($re)is)
         {$maxZoomOut{$name}++
         }
       }
     }

    if (my $h = $o->href)                                                       # Check href for url encoding needed
     {if ($h =~ m(\s)s)
       {$xref->hrefUrlEncoding->{$iFile}{$o->lineLocation} = $h;
       }
      if ($xref->deguidize and $h =~ m(\bguid-)is)                              # Deguidizing a href that looks as if it might have a guid in it
       {$xref->fixRefs->{$iFile}{$h}++
       }
     }

    if ($o->isText)                                                             # Check text for interesting constructs
     {my $t = $o->text;
      my @l = $t =~ m(&lt;(.*?)&gt;)g;
      for my $l(@l)
       {$xref->ltgt->{$iFile}{$l}++;
       }
     }
   });

  push @{$xref->improvements->{$iFile}}, @improvements if @improvements;        # Save improvements
  $xref->maxZoomOut->{$iFile} = \%maxZoomOut;                                   # Save max zoom

  $xref->topicIds                    ->{$iFile} = $x->id;                       # Topic Id
  $xref->docType                     ->{$iFile} = $x->tag;                      # Document type
  $xref->attributeCount              ->{$iFile} = $x->countAttrNames;           # Attribute names
  $xref->attributeNamesAndValuesCount->{$iFile} = $x->countAttrNamesAndValues;  # Attribute names and values
  $xref->baseTag                     ->{$iFile} = $x->tag;                      # Tag on base node
  $xref->tagCount                    ->{$iFile} = $x->countTagNames;            # Tag names
  $xref->vocabulary                  ->{$iFile} = $x->stringTagsAndText;        # Text of topic minus attributes

  if (1)                                                                        # Check xml headers and lint errors
   {my @h = split /\n/, my $s = readFile($iFile);
    if (!$h[0] or $h[0] !~ m(\A<\?xml version=\"1.0\" encoding=\"UTF-8\"\?>\Z))
     {$xref->badXml1->{$iFile}++;
     }
    my $tag = $x->tag;
    if (!$h[1] or $h[1] !~ m(\A<!DOCTYPE $tag PUBLIC "-//))
     {$xref->badXml2->{$iFile}++;
     }

    $xref->validationErrors->{$iFile}++ if $s =~ m(<!--compressedErrors:)s;     # File has validation errors
   }

  $xref
 } # analyzeOneFile

sub reportGuidsToFiles($)                                                       #P Map and report guids to files
 {my ($xref) = @_;                                                              # Xref results

  my @guids;                                                                    # One [guid, file] row per guid topic id found
  for my $source(sort keys %{$xref->topicIds})                                  # Input files in name order
   {my $id = $xref->topicIds->{$source};                                        # Topic id recorded for this file
    next unless $id;                                                            # No topic id - missing topic ids are reported by reportDuplicateTopicIds
    next unless $id =~ m(\AGUID-)is;                                            # Only topic ids that start with guid- in any case are of interest
    $xref->guidToFile->{$id} = $source;                                         # Remember which file defines this guid
    push @guids, [$id, $source];                                                # Row for the report
   }

  formatTable(\@guids, <<END,                                                   # Write the report, its result is the result of this sub
Guid The guid being defined
File The file that defines the guid as a relative file name
END
    title    =>qq(Guid topic definitions),
    head     =>qq(Xref found NNNN guid topic definitions on DDDD),
    summarize=>1,
    file     =>fpe($xref->reports, q(lists), qw(guidsToFiles txt)));
 }

sub editXml($$$)                                                                #P Edit an xml file: write the pretty printed parse tree to the output file, preserving the first two header lines and any trailing lint comments of the input file
 {my ($in, $out, $x) = @_;                                                      # Input file, output file, parse tree
  my $s = readFile($in);                                                        # Existing source from which the header lines and lint comments are recovered
  my ($l1, $l2)      = split m/\n/, $s, 3;                                      # Header lines - either is undefined if the source has fewer than two lines
  my (undef, $lint)  = split m/(?=\<\!\-\-linted\:)/, $s, 2;                    # Lint comments: everything from the first <!--linted: onwards
  my $t = (-p $x).($lint ? qq(\n$lint) : q());                                  # New body: the parse tree followed by the lint comments if any
  if (defined($l1) and defined($l2) and $l1 =~ m(\A<\?xml))                     # Check headers - should be improved. Both lines must exist: warnings are fatal in this package so using an undefined header line would otherwise kill the run
   {owf($out, qq($l1\n$l2\n$t));                                                # Copy the two header lines in front of the new body
   }
  else                                                                          # No xml declaration on line one or fewer than two lines available
   {owf($out, $t);                                                              # Write the new body without headers
   }
 }

=pod

Fix a file by moving its hrefs and conrefs to the xtrf attribute unless
deguidization is in effect and the guid can be converted into a valid Dita
reference accessing a file in the input corpus.

If fixRelocatedRefs is in effect: broken references are fixed by assuming that
the files mentioned in them have been relocated elsewhere in the folder
structure and can be located by base file name alone.

If fixXrefsByTitle is in effect apply the Gearhart Title Method: fix broken
xrefs by looking for topics with the same title text as the content of the
xref.

=cut

sub fixOneFile($$)                                                              #P Fix one file by moving unresolved references to the xtrf attribute. Returns a reference to an array of five array references: guid fixes, refs moved to xtrf, refs on which no action was taken, relocation fixes, relocation failures
 {my ($xref, $file) = @_;                                                       # Xref results, file to fix
  my @bad;                                                                      # Hrefs that could not be fixed and so were ameliorated by moving them to @xtrf
  my @good;                                                                     # Hrefs that were fixed by resolving a Guid
  my @none;                                                                     # Nodes where an href was expected but none was found
  my @fixRelRefsFixed;                                                          # References that were fixed by fixRelocatedRefs
  my @fixRelRefsFailed;                                                         # References that were not fixed by fixRelocatedRefs

  my %baseFiles;                                                                # Map base files back to full files
  my $fixRR = $xref->fixRelocatedRefs;

  if ($fixRR)                                                                   # Load base file name to full name but if needed to do relocation fixes
   {#for my $file(@{$xref->inputFiles})                                         # All input files
    for my $file(searchDirectoryTreesForMatchingFiles($xref->inputFolder))      # All input files
     {my $base = fne $file;
      $baseFiles{$base}{$file}++;
     }
   }

  my $fixXrefByTitle = sub                                                      # Attempt to fix an xref by using its text to search for a matching title. Returns 1 if the xref was updated else undef
   {my ($xref, $file, $o, $h, $attr) = @_;                                      # Xref results, file to fix, node to fix, href to fix, name of the attr the href occurs on
    return undef unless -t $o eq q(xref);                                       # Only works for xrefs

    if (my $topics = $xref->titleToFile->{nws(-C $o)})                          # Find the topics that match the title text
     {if (keys %$topics == 1)                                                   # Unique matching topic
       {my ($path) = keys %$topics;
        my $rel    = relFromAbsAgainstAbs($path, $file);                        # Relative file name
        $o->href   = $rel;                                                      # Update xref
        return 1;                                                               # Link successfully updated - only reported when the xref was actually changed
       }
     }
    undef                                                                       # Failed: no topic has this title or the title is ambiguous, so the caller can try the next remedy
   };

  my $fixRelRef = sub                                                           # Attempt to fix a reference broken by relocation. Returns 1 if the reference was rewritten else undef
   {my ($xref, $file, $o, $h, $attr) = @_;                                      # Xref results, file to fix, node to fix, href to fix, name of the attr the href occurs on
    my ($R, $rest) = split m(#)s, $h;                                           # Get referenced file name
    if ($R)
     {my $r = fne($R);                                                          # Href file base name
      if (my $F = $baseFiles{$r})                                               # Relocated else where
       {my @targets = sort keys(%$F);                                           # Relocation targets
        if (@targets == 1)                                                      # Just one such relocation
         {my $f = relFromAbsAgainstAbs($targets[0], $file);                     # Link to it
          if ($f ne $R)
           {my $newLink;                                                        # Fix if the target is else where
            if ($rest)                                                          # Link has more than one component
             {$o->set($attr=>($newLink = $f.q(#).$rest));                       # Reset link
             }
            else                                                                # Link has just one component
             {$o->set($attr=>($newLink = $f));                                  # Reset link
             }
            push @fixRelRefsFixed, [$newLink, $h, $file];
            return 1;                                                           # Successful fix
           }
         }
        elsif (@targets > 1)                                                    # Too many targets indicating that the base files names have not been normalized
         {push @fixRelRefsFailed,
           ["Multiple targets", join(", ", @targets), $h, $file];
         }
       }
      else                                                                      # No target - indicating that the targeted file does not exists anywhere in the corpus
       {push @fixRelRefsFailed, ["No target", q(), $h, $file];
       }
     }
    undef                                                                       # Failed
   };

  my $fixOneRef = sub                                                           # Fix one unresolved reference by moving it to the xtrf attribute
   {my ($xref, $file, $o, $h, $attr) = @_;                                      # Xref results, file to fix, node to fix, href to fix, name of the attr the href occurs on

    return unless $xref->fixRefs->{$file}{$h};                                  # Not a fixable href

    if ($xref->deguidize and $h =~ m(GUID-)is)                                  # On a guid and deguidization allowed so given g1#g2/id convert g1 to a file name by locating the topic with topicId g2.
     {my @h = split /\s+/, $h;                                                  # There might be multiple references in the href
      my @unresolved;
      my @resolved;

      for my $ref(@h)                                                           # Each reference in the href
       {my ($guid, $rest) = split /#/, $ref;
        if (my $target = $xref->guidToFile->{$guid})                            # Target file associated with guid
         {my $link = relFromAbsAgainstAbs($target, $file);                      # Relative link
          $link .= q(#).$rest if $rest;                                         # Remainder of reference which does not change as it is not file related
          if (!@resolved)                                                       # First resolution
           {$o->set($attr=>$link);                                              # New href or conref
            push @good, [$h, $ref, $target, $file];                             # Report fix
           }
          push @resolved, $ref;
         }
        else
         {push @unresolved, $ref;
         }
       }

      if (@unresolved and $xref->fixBadRefs)                                    # Unresolved - transfer all references to xtrf so some-one else can try
       {$o->renameAttr($attr, q(xtrf));                                         # No target file for guid
        push @bad, [q(No file for Guid), $h, $file];                            # Report fix
       }
     }
    elsif ($xref->fixRelocatedRefs and &$fixRelRef(@_))                         # Try to fix as a relocated ref if possible
     {}
    elsif ($xref->fixXrefsByTitle  and &$fixXrefByTitle(@_))                    # Try to fix a missing xref by title
     {}
    elsif ($xref->fixBadRefs)                                                   # Move href to xtrf as no other fix seems possible given that we have already tried to fix it as a guid and it was reportedly not working as a standard dita reference.
     {$o->renameAttr($attr, q(xtrf));                                           # Mask the failing reference
      if ($xref->deguidize)
       {push @bad, [q(Not a guid, no such target), $h, $file];
       }
      else
       {push @bad, [q(No such target), $h, $file];
       }
     }
    else                                                                        # Fix not requested so href left alone
     {push @none, [q(Fix not requested), $h, $file];                            # No fix for the href has been requested so no need to report it as fixed or bad
     }
   };

  my $x = Data::Edit::Xml::new($file);                                          # Parse xml - should parse OK else otherwise how did we find out that this file needed to be fixed

  $x->by(sub                                                                    # Each node
   {my ($o) = @_;
    my $t  = $o->tag;                                                           # Tag
    if ($t =~  m(\A(appendix|chapter|image|link|topicref|xref)\Z)is)            # Hrefs that need to be fixed
     {if (my $h = $o->href)                                                     # Fix the href by moving it to xtrf
       {&$fixOneRef($xref, $file, $o, $h, q(href));                             # Fix one href
       }
      else                                                                      # No href to fix
       {push @none, [q(No Href), $h, $file];
       }
     }
    if (my $conref = $o->attr_conref)                                           # Fix a conref
     {&$fixOneRef($xref, $file, $o, $conref, q(conref));
     }
    if (my $conref = $o->attr_conrefend)                                        # Fix a conrefend
     {&$fixOneRef($xref, $file, $o, $conref, q(conrefend));
     }
   });

  if (my $fixedFolder = $xref->fixedFolder)                                     # Write the fixed file to the fixedFolder copying the doctype
   {my  $s = readFile($file);

    if ($s =~ m((<!DOCTYPE[^>]*>))s)                                            # Nothing is written to the fixed folder if the source has no doctype
     {my $d = $1;
      my $t = -p $x;
      my $f = swapFolderPrefix($file, $xref->inputFolder, $fixedFolder);
      owf($f, <<END);
<?xml version="1.0" encoding="UTF-8"?>
$d
$t
END
     }
   }
  else                                                                          # No fixed folder so the input file is updated in place
   {editXml($file, $file, $x);                                                  # Edit xml processed by Data::Edit::Xml::Lint
   }

  [\@good, \@bad, \@none, \@fixRelRefsFixed, \@fixRelRefsFailed]                # Return report results
 }

=pod

Only files that have something in them that needs fixing are parsed and fixed
as this saves time not processing files that do not need any work on them.

=cut

sub fixFiles($)                                                                 #P Fix files by moving unresolved references to the xtrf attribute if no other solution is available
 {my ($xref) = @_;                                                              # Xref results
  my (@fixed, @masked, @untouched);                                             # Refs resolved as guids, refs moved to xtrf, refs on which no action was taken
  my (@relocated, @notRelocated);                                               # Refs fixed by relocation, refs that relocation could not fix

  my %filesByTitle;                                                             # Normalized title to the files bearing that title, for the Gearhart Title Method
  for my $source(keys %{$xref->title})                                          # Each file for which a title was recorded
   {my $title = nws($xref->title->{$source});
    $filesByTitle{$title}{$source}++;
   }
  $xref->titleToFile = \%filesByTitle;                                          # Made available to fixOneFile

  my @files = sort keys %{$xref->fixRefs};                                      # Files that contain references needing a fix
  if (@files)                                                                   # Fix files if requested
   {my @rows = squareArray(@files);                                             # Divide the task into rows of files, one row per process

    my $starter = newProcessStarter($xref->maximumNumberOfProcesses);           # Process starter
    $starter->processingTitle   = q(Xref);
    $starter->totalToBeStarted  = scalar @rows;
    $starter->processingLogFile = fpe($xref->reports, qw(log xref fix txt));

    for my $row(@rows)                                                          # One process per row of files
     {$starter->start(sub
       {my @results;                                                            # One result set per file in the row
        for my $source(@$row)
         {push @results, $xref->fixOneFile($source);                            # Fix one file
         }
        [@results]                                                              # Return results as a reference
       });
     }

    for my $result(deSquareArray($starter->finish))                             # Consolidate the per file results
     {my ($good, $bad, $none, $moved, $unmoved) = @$result;
      push @fixed,        @$good;
      push @masked,       @$bad;
      push @untouched,    @$none;
      push @relocated,    @$moved;
      push @notRelocated, @$unmoved;
     }
   }

  $xref->fixedRefsFailed = \@masked;                                            # Save then report bad hrefs moved to xtrf
  formatTable(\@masked, <<END,
Reason         The reason the conref/href was not fixed
Href           The conref/href not being fixed
Source_File    The source file in which the conref/href appears
END
    summarize=>1,
    title=>qq(These failing conref/hrefs referred to files that could not be located and so were masked to \@xtrf instead),
    head=><<END,
Xref moved NNNN conref/hrefs to xtrf on DDDD
END
    file=>(fpe($xref->reports, qw(bad fixedRefs txt))));

  $xref->fixedRefsNoAction = \@untouched;                                       # Save then report hrefs on which no action was taken
  formatTable(\@untouched, <<END,
Reason         The reason no action was taken on the conref/href despite action being requested
Href           The conref/href on which no action was taken
Source_File    The source file in which the conref/href appears
END
    summarize=>1,
    title=>qq(No action was taken on these failing conref/hrefs despite a request that the href be fixed),
    head=><<END,
Xref took no action on NNNN conref/hrefs despite a request that the href be fixed on DDDD

See below for the readons why no action was taken on the specified conref/hrefs.
END
    file=>(fpe($xref->reports, qw(bad fixedRefsNoAction txt))));

  $xref->fixedRefs = \@fixed;                                                   # Save then report hrefs which were interpreted as guids and successfully resolved
  formatTable(\@fixed, <<END,
Href           The conref/href which might contain more than one reference specification
Ref            The actual reference from the conref/href that is being resolved
Target_File    The located target file
Source_File    The source file in which the conref/href appears
END
    summarize=>1,
    title=>qq(These failing conref/hrefs were reinterpreted as guids and successfully resolved),
    head=><<END,
Xref successfully resolved NNNN hrefs as guids on DDDD
END
    file=>(fpe($xref->reports, qw(good fixedRefs txt))));

  $xref->relocatedReferencesFixed = \@relocated;                                # Save then report relocated references fixed
  formatTable(\@relocated, <<END,
New_Reference  The newly created reference
Old_Reference  The original reference
Source_File    The source file containing the reference
END
    summarize=>1,
    title=>qq(These failing conref/hrefs were reinterpreted as relocated references and successfully resolved),
    head=><<END,
Xref successfully resolved NNNN relocated hrefs on DDDD
END
    file=>(fpe($xref->reports, qw(good relocated_references txt))));

  $xref->relocatedReferencesFailed = \@notRelocated;                            # Save then report references that were not fixed by relocation
  formatTable(\@notRelocated, <<END,
Reason      The reason the reference could not be fixed by relocation
Targets     A list of the possible target locations for this reference if there are more than one
Reference   The reference that might be fixable by relocation
Source_File The source file containing the reference
END
    summarize=>1,
    title=>qq(These failing conref/hrefs could not be fixed by relocation),
    head=><<END,
Xref failed to relocate NNNN failing conrefs/hrefs on DDDD
END
    file=>(fpe($xref->reports, qw(bad relocated_references txt))));
 }

sub fixOneFileGB($$)                                                            #P Fix one file to the Gearhart-Brenan standard
 {my ($xref, $file) = @_;                                                       # Xref results, file to fix
  my @unfixed;                                                                  # [href, file] for each href whose target has no flattened file name

  my $tree = Data::Edit::Xml::new($file);                                       # Parse the file to be fixed

  $tree->by(sub                                                                 # Visit each node
   {my ($node) = @_;
    if (my $href = $node->href)                                                 # Node carries an href
     {my ($relative, $fragment) = split /#/, $href, 2;                          # File part and the remainder after the first #
      my $absolute = absFromAbsPlusRel($file, $relative);                       # Referenced file resolved against the file being fixed
      my $flat     = $xref->flattenFiles->{$absolute};                          # Flattened name of the referenced file if it has one
      if ($flat)
       {my $suffix  = $fragment ? qq(#$fragment) : qq();                        # Put back the remainder of the reference if there was one
        $node->href = $flat.$suffix;                                            # Point the href at the flattened file name
       }
      else                                                                      # Referenced file has no flattened name so the href is left alone and reported
       {push @unfixed, [$href, $file];
       }
     }
   });

  my $out = fpf($xref->flattenFolder, $xref->flattenFiles->{$file});            # Flattened location for the file being fixed
  editXml($file, $out, $tree);                                                  # Write the edited xml to its flattened location

  \@unfixed                                                                     # Return the hrefs that were not changed
 }

sub fixFilesGB($)                                                               #P Rename files to the Gearhart-Brenan standard
 {my ($xref) = @_;                                                              # Xref results
  my @files  = grep {!$xref->parseFailed->{$_}} sort @{$xref->inputFiles};      # Only files that parsed are renamed
  my @square = squareArray(@files);                                             # Divide the task into rows of files, one row per process

  for my $file(@files)                                                          # Choose the target file name for each input file
   {my $title = $xref->title->{$file} // q();                                   # Title or empty if none was recorded
    $title    =~ s([^a-zA-Z0-9]+) (_)gs;                                        # Each run of characters other than letters and digits becomes one underscore
    my $md5   = $xref->md5Sum->{$file} // q();                                  # Md5 sum or empty if none was recorded
    my $first = substr($xref->baseTag->{$file}//q(u), 0, 1);                    # First letter of the base tag, u if there is no base tag
    my $name  = join q(_), $first, firstNChars($title, maximumFileNameChars), $md5; # The Gearhart-Brenan file naming standard

    $xref->flattenFiles->{$file} = fpe($name, fe $file);                        # Record correspondence between existing file and standardized file name
   }

  my $starter = newProcessStarter($xref->maximumNumberOfProcesses);             # Process starter
  $starter->processingTitle   = q(Xref Gearhart);
  $starter->totalToBeStarted  = scalar @square;
  $starter->processingLogFile = fpe($xref->reports, qw(log flatten txt));

  for my $row(@square)                                                          # One process per row of input files
   {$starter->start(sub
     {my @results;                                                              # One result per file in the row
      for my $source(@$row)
       {push @results, $xref->fixOneFileGB($source);                            # Fix one input file
       }
      [@results]                                                                # Return results as a reference
     });
   }

  my @unfixed;                                                                  # Hrefs that could not be changed, gathered from every file
  for my $result(deSquareArray($starter->finish))                               # Consolidate results
   {push @unfixed, @$result;
   }

  $xref->fixedRefsGB = \@unfixed;                                               # Save then report results
  formatTable(\@unfixed, <<END,
Href           The href being fixed
Source         The source file containing the href
END
    summarize=>1,
    title=>qq(Hrefs that can not be renamed to the Gearhart-Brenan file naming standard),
    head=><<END,
Xref failed to fix NNNN hrefs to the Gearhart-Brenan file naming standard
END
    file=>(fpe($xref->reports, qw(bad fixedRefsGB txt))));
 }

sub analyze($)                                                                  #P Analyze the input files in parallel and merge the per file results into the cross referencer
 {my ($xref) = @_;                                                              # Cross referencer
  my @in = @{$xref->inputFiles};                                                # Input files

  my @square = squareArray(@in);                                                # Divide the task
  my $square = @square;

  my $p = newProcessStarter($xref->maximumNumberOfProcesses);                   # Process starter
     $p->processingTitle   = q(Xref Analyze);
     $p->totalToBeStarted  = $square;
     $p->processingLogFile = fpe($xref->reports, qw(log xref analyze txt));

  for my $row(@square)                                                          # Each row of input files file
   {$p->start(sub
     {my @r;                                                                    # Results
      for my $col(@$row)                                                        # Each column in the row
       {push @r, analyzeOneFile($xref, $col);                                   # Analyze one input file
       }
      [@r]                                                                      # Return results as a reference
     });
   }
# Load results takes 4 minutes while merge takes 10 seconds for all fields
  lll "Phase: analyze load results start" if $xref->debugTimes;
  my @x = deSquareArray($p->finish);                                            # Load results
  lll "Phase: analyze load results end"   if $xref->debugTimes;

  lll "Phase: analyze merge fields start" if $xref->debugTimes;
  my @fields =                                                                  # Fields to be merged
    qw(
attributeCount
attributeNamesAndValuesCount
author
badXml1
badXml2
baseTag
conRefs
docType
fixRefs
guidHrefs
hrefUrlEncoding
ids
images
imagesReferencedFromTopics
improvements
ltgt
maxZoomOut
md5Sum
noHref
olBody
parseFailed
tagCount
title
topicIds
topicRefs
validationErrors
vocabulary
xrefBadFormat
xrefBadScope
xRefs
 );

  my $fields = @fields;
  my $q = newProcessStarter($xref->maximumNumberOfProcesses);                   # Process starter
     $q->processingTitle   = q(Xref Analyze Merge);
     $q->totalToBeStarted  = $fields;
     $q->processingLogFile = fpe($xref->reports, qw(log xref analyzeMerge txt));

  for my $field(@fields)                                                        # Merge hashes by file names which are unique - ffff
   {$q->start(sub
     {lll "Phase: analyze merge field $field start" if $xref->debugTimes;
      my $target = $xref->{$field} //= {};                                      # Field to be merged
      for my $x(@x)                                                             # mmmm Merge results from each file analyzed
       {if (my $xf = $x->{$field})
         {for my $f(keys %$xf)                                                  # Each file analyzed
           {$target->{$f} = $xf->{$f}                                           # Merge
           }
         }
       }
      lll "Phase: analyze merge field $field end" if $xref->debugTimes;
      [$field, $target]                                                         # Return only the merged field: the consolidation below uses nothing else, so returning the whole cross referencer once per field is wasted work
     });
   }

  my @merge = $q->finish;                                                       # Load results
  for my $m(@merge)
   {my ($f, $merged) = @$m;                                                     # Field name, merged hash for that field
    $xref->{$f} = $merged;
   }

  for my $field(                                                                # Merge arrays
    qw(badTables))
   {for my $x(@x)                                                               # mmmm Merge results from each file analyzed
     {next unless my $xf = $x->{$field};
      push @{$xref->{$field}}, @$xf;
     }
   }
  lll "Phase: analyze merge fields end" if $xref->debugTimes;
 }

sub reportDuplicateIds($)                                                       #P Report duplicate ids
 {my ($xref) = @_;                                                              # Cross referencer

  my @repeated;                                                                 # [id, count, file] for each id defined more than once within a file
  my %byFile;                                                                   # File to the last such row seen for that file
  for my $source(sort keys %{$xref->ids})                                       # Input files in name order
   {for my $id(sort keys %{$xref->ids->{$source}})                              # Ids in this file in name order
     {my $n = $xref->ids->{$source}{$id};                                       # Number of definitions of this id in the file
      next unless $n > 1;                                                       # Defined once only - not a duplicate
      my $row = [$id, $n, $source];
      push @repeated, $row;
      $byFile{$source} = $row;                                                  # A later duplicate in the same file replaces an earlier one
     }
   }

  $xref->duplicateIds = \%byFile;                                               # Duplicates keyed by file

  formatTable(\@repeated, [qw(Id Count File)],
    title=>qq(Duplicate id definitions within files),
    head=><<END,
Xref found NNNN duplicate id definitions within files on DDDD

These ids are duplicated within a file, possibly because they were copied from
another part of the same file.  This report does not show ids that are the same
in different files as this is not a problem using Dita's three part addressing
scheme which requires only that the topic id be unique across all files.

Duplicate topic ids are reported in ../bad/topicIds.txt.

END
    file=>(fpe($xref->reports, qw(bad duplicateIds txt))));
 }

sub reportDuplicateTopicIds($)                                                  #P Report duplicate topic ids
 {my ($xref) = @_;                                                              # Cross referencer

  my %firstFile;                                                                # Topic id to the first file found defining it
  my @clashes;                                                                  # [topic id, file, first file] for each repeated topic id
  my @without;                                                                  # [file] for each file that has no topic id

  for my $source(sort keys %{$xref->topicIds})                                  # Input files in name order
   {my $id = $xref->topicIds->{$source};                                        # Topic id of this file
    if (!$id)                                                                   # Missing topic id
     {push @without, [$source];
     }
    elsif (my $earlier = $firstFile{$id})                                       # Topic id already defined by an earlier file
     {push @clashes, [$id, $source, $earlier];
     }
    else                                                                        # First definition of this topic id
     {$firstFile{$id} = $source;
     }
   }

  my %clashById;                                                                # Topic id to the last clash recorded for it
  $clashById{$_->[0]} = $_ for @clashes;
  my %withoutByFile;                                                            # File to its missing topic id row
  $withoutByFile{$_->[0]} = $_ for @without;

  $xref->duplicateTopicIds = \%clashById;                                       # All duplicates
  $xref->missingTopicIds   = \%withoutByFile;                                   # All missing

  formatTable(\@clashes, [qw(TopicId File1 File2)],
    title=>qq(Duplicate topic id definitions),
    head=><<END,
Xref found NNNN duplicate topic id definitions on DDDD

File1, File2 are two files that both define TopicId

END
    file=>(fpe($xref->reports, qw(bad duplicateTopicIds txt))));

  formatTable(\@without, [qw(File)],
    title=>qq(Topics without ids),
    head=><<END,
Xref found NNNN topics that have no topic id on DDDD

END
    file=>(fpe($xref->reports, qw(bad topicIdDefinitionsMissing txt))));
 }

sub reportNoHrefs($)                                                            #P Report locations where an href was expected but not found
 {my ($xref) = @_;                                                              # Cross referencer

  my @rows;                                                                     # Missing href details gathered from every file
  for my $file(sort keys %{$xref->noHref})                                      # Each input file in name order
   {my $details = $xref->noHref->{$file};                                       # Missing href details for this file
    push @rows, @$details;
   }

  my $columns = <<END;                                                          # Column names and descriptions
Tag        A tag that should have an xref.
Location   The location of the tag that should have an xref.
File       The source file containing the tag
END

  my $head = <<END;                                                             # Report heading
Xref found NNNN tags that should have href attributes but did not on DDDD
END

  formatTable(\@rows, $columns,
    title=>qq(Missing hrefs),
    head=>$head,
    file=>fpe($xref->reports, qw(bad missingHrefAttributes txt)));
 }

sub reportRefs($$)                                                              #P Report bad references found in xrefs or conrefs as they have the same structure
 {my ($xref, $type) = @_;                                                       # Cross referencer, type of reference to be processed
                                                                                # The references are read from $xref->{${type}Refs}, the results are saved in
                                                                                # $xref->{bad<Type>Refs}, {good<Type>Refs}, {bad<Type>RefsList}, {good<Type>RefsList}
                                                                                # and every bad href is counted in $xref->fixRefs.

  my @bad; my @good;                                                            # Bad references, good references
  for   my $file(sort keys %{$xref->{${type}.q(Refs)}})                         # Each input file which will be absolute
   {my $sourceTopicId = $xref->topicIds->{$file};                               # Topic id of the file containing the reference
    for my $href(sort keys %{$xref->{${type}.q(Refs)}{$file}})                  # Each href in the file which will be relative
     {my @text;                                                                 # Text associated with the reference

      if (               ref($xref->{${type}.q(Refs)}{$file}{$href}))           # xRef: Text associated with reference deemed helpful by Bill
       {@text =  sort keys %{$xref->{${type}.q(Refs)}{$file}{$href}};
        s(\s+) ( )gs for @text;                                                 # Normalize white space
       }

      if ($href =~ m(#))                                                        # Href with #
       {my ($hFile, $hId) = split m(#), $href;                                  # File, topicId components
        $hFile //= q();                                                         # An href of just # has no file component
        $hId   //= q();                                                         # An href ending in # has no fragment - split drops trailing empty fields
        my ($topic, $id)  = split m(/), $hId;                                   # Topic, id
        $topic //= q();                                                         # Empty fragment: no topic named
        $id    //= q();                                                         # No id within the topic
        $topic  = $sourceTopicId // q() if $topic eq q(.);                      # In topic reference
        my $target = $hFile ? absFromAbsPlusRel($file, $hFile) : $file;         # Target file absolute

        my $good = sub                                                          # Save a good reference
         {push @good, [$href, $target, $file];
         };

        my $bad = sub                                                           # Save a bad reference
         {my ($reason, $t) = @_;                                                # Reason for failure, actual topic id of target file
          push @bad,
           [$reason, $href,
            $hFile, $topic, $id, $t, $sourceTopicId, $file, $target, @text];
         };

        if ($hFile and !(-e $target or -e wwwDecode($target)))                  # Check target file
         {$bad->(q(No such file), q());
         }
        elsif (my $t = $xref->topicIds->{$target})                              # Check topic id
         {if ($t eq $topic)
           {if (my $i = $xref->ids->{$target}{$id})                             # Number of times the id occurs in the target
             {if ($i == 1)
               {$good->();
               }
              else
               {$bad->(q(Duplicate id in topic), $t);
               }
             }
            elsif ($id)
             {$bad->(q(No such id in topic), $t);
             }
            else
             {$good->();
             }
           }
          else
           {$bad->(q(Topic id does not match target topic), $t);
           }
         }
        elsif ($topic =~ m(\S)s)                                                # The href contains a topic id but there is not topic with that id
         {$bad->(q(No topic id on topic in target file), $t);                   # $t is still in scope here but false
         }
        else
         {$good->();
         }
       }
      else                                                                      # No # in href
       {my $target = absFromAbsPlusRel($file, $href);
        if (!-e $target and !-e wwwDecode($target))                             # Actual file name or www encoded file name
         {push @bad, [qq(No such file), $href,
           $target, q(), q(), q(), $sourceTopicId, $file, $target, @text];
         }
        else
         {push @good, [$href, $target, $file];
         }
       }
     }
   }

  for my $bad(@bad)                                                             # List of files to fix
   {my $href = $$bad[1];
    my $file = $$bad[7];
    $xref->fixRefs->{$file}{$href}++;
   }

  my $Type = ucfirst $type;
  $xref->{q(bad).$Type.q(Refs)}  = {map {$$_[7]=>$_} @bad};                     # Bad references by source file
  $xref->{q(good).$Type.q(Refs)} = {map {$$_[1]=>$_} @good};                    # Good references by target file

                                                                                # NOTE(review): the filter below is marked as test code yet restricts the good list and report to hrefs matching \A\w_embed - confirm whether it should still be here
  @good = grep {$$_[0] =~ m(\A\w_embed)s} @good; ####TEST#### Show just hrefs that have embed in them
  for my $good(@good)                                                           # Reshape each surviving row to: target type, source type, href, source file
   {my (undef, $t, $s) = @$good;
    my $T = $xref->docType->{$t};
    my $S = $xref->docType->{$s};
    $$good[1] = swapFilePrefix($$good[2], $xref->inputFolder);
    pop @$good;

    if (($T // q()) eq ($S // q()))                                             # Doc type may be unknown for a file that exists but was not analysed
     {unshift @$good, $T, q();
     }
    else
     {unshift @$good, $T, $S;
     }
   }

  $xref->{q(bad).$Type.q(RefsList)}  = \@bad;                                   # Bad references list
  $xref->{q(good).$Type.q(RefsList)} = \@good;                                  # Good references list

  formatTable(\@bad, <<END,                                                     # Column descriptions in the same order as the fields of each bad row
Reason          The reason why the conref failed to resolve
Href            The href in the source file
Href_File       The target file referenced by the href in the source files
Href_Topic_Id   The id of the topic referenced by the href in the source file
HRef_Id         The id of the statement in the body of the topic referenced by the href in the source file
Target_Topic_Id The actual id of the topic in the target file
Source_TopicId  The topic id at the top of the source file containing the bad reference
Source_File     The source file containing the reference
Target_File     The target file pointed to by the reference
Example_Text    Any text associated with the link such as the navtitle of a bad topicRef or the CDATA text of an xref.
END
    title    =>qq(Bad ${type}Refs),
    head     =>qq(Xref found NNNN Bad ${type}Refs on DDDD),
    summarize=>1, csv=>1,
    wide     =>1,
    file     =>(fpe($xref->reports, q(bad), qq(${Type}Refs), q(txt))));

  formatTable(\@good, <<END,
Target          Target topic type if different from source topic type
Source          Source topic type if different from target topic type
Href            The href in the source file
Source_File     The source file containing the xref
END
#Target_File     The target file
    title    =>qq(Good ${type}Refs),
    head     =>qq(Xref found NNNN Good $type refs on DDDD),
    file     =>(fpe($xref->reports, q(good), qq(${Type}Refs), q(txt))));
 } # reportRefs


=pod

Report on hrefs that have been guidized and mark them for fixing.  The reasons
we do not fix them here are:

 - we do not have access to a parse tree in which to fix them
 - the caller might not want them fixed
 - the caller might want to choose the fixing strategy.

Thus this report merely identifies hrefs with guids in them, in line with
xref's initial goal of reporting the state of play, while the question of
actually improving the situation is deferred until later.

=cut

sub reportGuidHrefs($)                                                          #P Report on guid hrefs
 {my ($xref) = @_;                                                              # Cross referencer
                                                                                # Every guid href is counted in $xref->fixRefs, resolved image and topic references are
                                                                                # counted in goodImageRefs and goodTopicRefs, and the results are saved in
                                                                                # $xref->{badGuidHrefs} and $xref->{goodGuidHrefs} indexed by source file.

  my %guidToFile;                                                               # Map guids to files
  for   my $file(sort keys %{$xref->topicIds})                                  # Each input file containing a topic id
   {my $id = $xref->topicIds->{$file};                                          # Topic id of the file
    next unless defined $id;
    next unless $id =~ m(\bguid-)is;    ###TEST## guid appears somewhere in href
    $guidToFile{$id} = $file;                                                   # We report duplicates in reportDuplicateTopicIds
   }

  my @bad; my @good;                                                            # Good and bad guid hrefs
  for   my $file(sort keys %{$xref->guidHrefs})                                 # Each input file which will be absolute
   {my $sourceTopicId = $xref->topicIds->{$file};
    for my $href(sort keys %{$xref->guidHrefs->{$file}})                        # Each href in the file which will start with guid
     {my ($tag, $lineLocation) = @{$xref->guidHrefs->{$file}{$href}};           # Tag of node and location in source file of node doing the referencing

      $xref->fixRefs->{$file}{$href}++ unless $xref->fixRefs->{$file}{$href};   # Avoid double counting - all guid hrefs will be fixed if we are fixing hrefs as both good and bad will fail.

      if ($href =~ m(#))                                                        # Href with #
       {my ($guid, $topic, $id) = split m(#|\/), $href, 3;                      # Guid, topic, remainder
        my $targetFile   = $guidToFile{$guid};                                  # Locate file defining guid

        if (!defined $targetFile)                                               # No definition of this guid
         {push @bad,                                                            # Report missing guid
           ["No such guid defined", $tag, $href, $lineLocation, q(),
            $sourceTopicId, q(), $file];                                        # No target file known so leave that column empty like the other reasons do
          next;
         }

        my $targetFileId = $xref->topicIds->{$targetFile} // '';                # Actual id in target file

        my $bad = sub                                                           # Save a bad reference with the supplied reason
         {push @bad,
           [@_, $tag, $href, $lineLocation, $targetFileId, $sourceTopicId,
            $targetFile, $file];
         };

        my $good = sub                                                          # Save a good reference in the same column order as the other good rows
         {push @good,
           [$tag, $href, $lineLocation, $targetFile, $file];
         };

        if (!-e $targetFile)                                                    # Existence of file
         {$bad->(q(No such file));
         }
        elsif (defined $topic)                                                  # Topic defined so it must be an xref
         {if ($topic ne $guid)
           {$bad->(q(Guid does not match topic id));
           }
          elsif (defined $id)
           {if (my $i = $xref->ids->{$targetFile}{$id})                         # Check id exists in target file
             {if ($i == 1)
               {$good->();
               }
              else
               {$bad->(q(Duplicate id in topic));
               }
             }
            else                                                                # Only report a missing id when the id really was not found
             {$bad->(q(No such id in topic));
             }
           }
          else
           {$good->();
           }
         }
        else
         {$good->();
         }
       }
      elsif ($tag eq q(image))                                                  # Image reference
       {my $guid = $href =~ s(guid|-) ()igsr;                                   # Key for inputFolderImages is the href without guid and dashes
        if (my $image = $xref->inputFolderImages->{$guid})
         {push @good, [$tag, $href, $lineLocation, $image, $file];
          $xref->goodImageRefs->{$image}++;                                     # Found image
         }
        else
         {push @bad, [qq(No such image guid defined), $tag, $href,
           $lineLocation, q(), $sourceTopicId, q(), $file];
         }
       }
      else                                                                      # No # in href and not an image so it must be a bookmap element
       {my $targetFile = $guidToFile{$href};
        if (!defined $targetFile)                                               # No such guid
         {push @bad, [qq(No such guid defined), $tag, $href,
           $lineLocation, q(), $sourceTopicId, q(), $file];
         }
        elsif (!-e $targetFile)                                                 # Actual file name
         {push @bad, [qq(No such file), $tag, $href,
           $lineLocation, q(), $sourceTopicId, $targetFile, $file];
         }
        elsif ($xref->fixBadRefs)                                               # The file exists and we want to fix such references
         {$xref->fixRefs->{$file}{$href}++;
         }
        else
         {push @good, [$tag, $href, $lineLocation, $targetFile, $file];
          $xref->goodTopicRefs->{$targetFile}++;                                # Mark reference as found
         }
       }
     }
   }

  $xref->{badGuidHrefs}  = {map {$$_[7]=>$_} @bad};                             # Bad references by source file
  $xref->{goodGuidHrefs} = {map {$$_[4]=>$_} @good};                            # Good references by source file

  formatTable(\@bad, <<END,
Reason          The reason why the href failed to resolve
Tag             The tag of the node doing the referencing
Href            The href of the node doing the referencing
Line_Location   The line location where the href occurred in the source file
Target_Topic_Id The actual id of the topic in the target file
Source_Topic_Id The topic id in the source file
Target_File     The target file
Source_file     The source file containing the reference
END
    title    =>qq(Unresolved GUID hrefs),
    head     =>qq(Xref found NNNN unresolved GUID hrefs on DDDD),
    summarize=>1,
    wide     =>1,
    file     =>(fpe($xref->reports, q(bad), qw(guidHrefs txt))));

  formatTable(\@good, <<END,                                                    # Column descriptions in the same order as the fields of each good row
Tag             The tag containing the href
Href            The href of the node doing the referencing
Line_Location   The line location where the href occurred in the source file
Target_File     The target file
Source_File     The source file containing the reference
END
    title    =>qq(Resolved GUID hrefs),
    head     =>qq(Xref found NNNN Resolved GUID hrefs on DDDD),
    file     =>(fpe($xref->reports, q(good), qw(guidHrefs txt))));
 } # reportGuidHrefs

sub reportXrefs($)                                                              #P Report bad xrefs
 {my ($xref) = @_;                                                              # Cross referencer
  my $referenceType = q(x);                                                     # Type of reference to be processed by reportRefs
  return reportRefs($xref, $referenceType);
 }

sub reportTopicRefs($)                                                          #P Report topic refs
 {my ($xref) = @_;                                                              # Cross referencer
                                                                                # Each topic ref is resolved relative to the file containing it and checked for existence.
                                                                                # Bad references are counted in $xref->fixRefs and saved in badTopicRefs by target file,
                                                                                # good references are saved in goodTopicRefs by target file.

  my @bad; my @good;                                                            # Bad topic refs, good topic refs
  for   my $file(sort keys %{$xref->topicRefs})                                 # Each input file
   {my $sourceTopicId = $xref->topicIds->{$file};                               # Topic id of the file containing the reference
    for my $href(sort keys %{$xref->topicRefs->{$file}})                        # Each topic ref in the file
     {my @text;                                                                 # Text associated with the reference

      if ($href =~ m(#)s)                                                       # We will have to do something about this if we encounter href on topic/link ref that has # in the href.
       {cluck "# in href in topic reference requires new code";
       }

      if (               ref($xref->topicRefs->{$file}{$href}))                 # Text associated with reference
       {@text =  sort keys %{$xref->topicRefs->{$file}{$href}};
        s(\s+) ( )gs for @text;                                                 # Normalize white space
       }

      my $f = absFromAbsPlusRel($file, $href);                                  # Target file absolute
      if ($f)
       {if (!-e $f and !-e wwwDecode($f))                                       # Check target file
         {push @bad, [qq(No such file), $f, qq("$href"),
                      $sourceTopicId, $file, @text];
          $xref->fixRefs->{$file}{$href}++;
         }
        else
         {push @good, [$f, $href, $file];
         }
       }
     }
   }

  $xref->badTopicRefs  = {map {$$_[1]=>$_} @bad};                               # Bad topic references
  $xref->goodTopicRefs = {map {$$_[0]=>$_} @good};                              # Good topic references

  formatTable(\@bad, <<END,                                                     # One column description per field of each bad row
Reason          Reason the topic reference failed
FullFileName    Name of the targeted file
Href            Href text
Source_Topic_Id The topic id of the file containing the bad xref
Source_File     The source file containing the reference as an absolute file path
Example_Text    Any text bracketed by the topic ref
END
    title    =>qq(Bad topicrefs),
    head     =>qq(Xref found NNNN Bad topicrefs on DDDD),
    summarize=>1,
    wide     =>1,
    file     =>(fpe($xref->reports, qw(bad topicRefs txt))));

  formatTable(\@good, <<END,
FullFileName  The target file name
Href          The href text in the source file
Source        The source file
END
    title=>qq(Good topicrefs),
    head=>qq(Xref found NNNN Good topicrefs on DDDD),
    file=>(fpe($xref->reports, qw(good topicRefs txt))));
 }

sub reportConrefs($)                                                            #P Report bad conrefs refs
 {my ($xref) = @_;                                                              # Cross referencer
  my $referenceType = q(con);                                                   # Type of reference to be processed by reportRefs
  return reportRefs($xref, $referenceType);
 }

sub reportImages($)                                                             #P Reports on images and references to images
 {my ($xref) = @_;                                                              # Cross referencer
                                                                                # Found images are counted in goodImageRefs, missing images are counted in badImageRefs
                                                                                # and fixRefs, and the missing rows are saved in missingImageFiles.

  my @bad;                                                                      # Bad images
  for my $file(sort keys %{$xref->images})                                      # Each input file
   {my $sourceTopicId = $xref->topicIds->{$file};                               # Topic id of the file containing the image reference
    for my $href(sort keys %{$xref->images->{$file}})                           # Each image in the file
     {my $image = absFromAbsPlusRel($file, $href);                              # Image relative to current file
      if (-e $image or -e wwwDecode($image))                                    # Actual image name or www encoded image name
       {$xref->goodImageRefs->{$image}++;                                       # Found image
       }
      else
       {push @bad, [$href, $image, $sourceTopicId, $file];                      # Missing image reference
        $xref->badImageRefs->{$image}++;                                        # Number of missing references
        $xref->fixRefs->{$file}{$href}++;
       }
     }
   }

  $xref->missingImageFiles = [@bad];                                            # Missing image file names

  formatTable([sort {$$a[0] cmp $$b[0]} @bad], <<END,
Href            Image reference in source file
Image           Targetted image name
Source_Topic_Id The topic id of the file containing the missing image
Source          The source file containing the image reference as an absolute file path

END
    title=>qq(Bad image references),
    head=>qq(Xref found NNNN bad image references on DDDD),
    summarize=>1,
    file=>(fpe($xref->reports, qw(bad imageRefs txt))));

  my $found = [map {[$xref->goodImageRefs->{$_}, $_]}                           # Sorted so that the report is the same from run to run, as for the missing images below
               sort keys %{$xref->goodImageRefs}];

  formatTable($found, <<END,
Count          Number of references to each image file found.
ImageFileName  Full image file name
END
    title=>qq(Image files),
    head=>qq(Xref found NNNN image files found on DDDD),
    file=>(fpe($xref->reports, qw(good imagesFound txt))));

  my $missing = [map {[$xref->badImageRefs->{$_}, $_]}
                 sort keys %{$xref->badImageRefs}];

  formatTable($missing, <<END,
Count          Number of references to each image file found.
ImageFileName  Full image file name
END
    title=>qq(Missing image references),
    head=>qq(Xref found NNNN images missing on DDDD),
    file=>(fpe($xref->reports, qw(bad imagesMissing txt))));
 }

sub reportParseFailed($)                                                        #P Report failed parses
 {my ($xref) = @_;                                                              # Cross referencer

  my $failures = $xref->parseFailed;                                            # Files that failed to parse
  my $columns  = <<END;                                                         # Column names and descriptions
Source The file that failed to parse as an absolute file path
END

  formatTable($failures, $columns,
    title=>qq(Files failed to parse),
    head=>qq(Xref found NNNN files failed to parse on DDDD),
    file=>fpe($xref->reports, qw(bad parseFailed txt)));
 }

sub reportXml1($)                                                               #P Report bad xml on line 1
 {my ($xref) = @_;                                                              # Cross referencer

  my @files   = sort keys %{$xref->badXml1};                                    # Files recorded in badXml1 in name order
  my $columns = <<END;                                                          # Column names and descriptions
Source  The source file containing bad xml on line
END

  formatTable(\@files, $columns,
    title=>qq(Bad Xml line 1),
    head=>qq(Xref found NNNN Files with the incorrect xml on line 1 on DDDD),
    file=>fpe($xref->reports, qw(bad xmlLine1 txt)));
 }

sub reportXml2($)                                                               #P Report bad xml on line 2
 {my ($xref) = @_;                                                              # Cross referencer

  my @files   = sort keys %{$xref->badXml2};                                    # Files recorded in badXml2 in name order
  my $columns = <<END;                                                          # Column names and descriptions
Source  The source file containing bad xml on line
END

  formatTable(\@files, $columns,
    title=>qq(Bad Xml line 2),
    head=>qq(Xref found NNNN Files with the incorrect xml on line 2 on DDDD),
    file=>fpe($xref->reports, qw(bad xmlLine2 txt)));
 }

sub reportDocTypeCount($)                                                       #P Report doc type count
 {my ($xref) = @_;                                                              # Cross referencer

  my %d;                                                                        # Number of files of each doc type
  for my $f(sort keys %{$xref->docType})                                        # Each file with a known doc type
   {my $d = $xref->docType->{$f};
    $d{$d}++
   }

  formatTable(\%d, [qw(DocType Count)],                                         # Title both columns as the other count reports do
    title=>qq(Document types),
    head=>qq(Xref found NNNN different doc types on DDDD),
    file=>(fpe($xref->reports, qw(count docTypes txt))));
 }

sub reportTagCount($)                                                           #P Report tag counts
 {my ($xref) = @_;                                                              # Cross referencer

  my %total;                                                                    # Occurrences of each tag summed over all files
  for my $file(sort keys %{$xref->tagCount})                                    # Each file with tag counts
   {my $counts = $xref->tagCount->{$file};                                      # Tag counts for this file
    $total{$_} += $counts->{$_} for sort keys %$counts;
   }

  my $report = fpe($xref->reports, qw(count tags txt));                         # Report file
  formatTable(\%total, [qw(Tag Count)],
    title=>qq(Tags),
    head=>qq(Xref found NNNN different tags on DDDD),
    file=>$report);
 }


sub reportLtGt($)                                                               #P Report items found between &lt; and &gt;
 {my ($xref) = @_;                                                              # Cross referencer

  my %total;                                                                    # Occurrences of each text summed over all files
  for my $file(sort keys %{$xref->ltgt})                                        # Each file with counted texts
   {my $counts = $xref->ltgt->{$file};                                          # Text counts for this file
    $total{$_} += $counts->{$_} for sort keys %$counts;
   }

  my @rows;                                                                     # One row per text: count, text with normalized white space
  push @rows, [$total{$_}, nws($_)] for sort keys %total;

  my $columns = <<END;                                                          # Column names and descriptions
Count The number of times this text was found
Text  The text found between &lt; and &gt;. The white space has been normalized to make better use of the display.
END

  my $head = <<END;                                                             # Report heading
Xref found NNNN different text items between &lt; and &gt; on DDDD
END

  formatTable(\@rows, $columns,
    title=>qq(Text found between &lt; and &gt;),
    head=>$head,
    file=>fpe($xref->reports, qw(count ltgt txt)));
 }

sub reportAttributeCount($)                                                     #P Report attribute counts
 {my ($xref) = @_;                                                              # Cross referencer

  my %total;                                                                    # Occurrences of each attribute summed over all files
  for my $file(sort keys %{$xref->attributeCount})                              # Each file with attribute counts
   {my $counts = $xref->attributeCount->{$file};                                # Attribute counts for this file
    $total{$_} += $counts->{$_} for sort keys %$counts;
   }

  my $report = fpe($xref->reports, qw(count attributes txt));                   # Report file
  formatTable(\%total, [qw(Attribute Count)],
    title=>qq(Attributes),
    head=>qq(Xref found NNNN different attributes on DDDD),
    file=>$report);
 }

sub reportAttributeNamesAndValuesCount($)                                       #P Report attribute value counts
 {my ($xref) = @_;                                                              # Cross referencer

  my %total;                                                                    # {attribute}{value} = occurrences summed over all files
  for my $file(sort keys %{$xref->attributeNamesAndValuesCount})                # Each file
   {my $attributes = $xref->attributeNamesAndValuesCount->{$file};
    for my $attribute(sort keys %$attributes)                                   # Each attribute in the file
     {my $values = $attributes->{$attribute};
      for my $value(sort keys %$values)                                         # Each value of the attribute
       {$total{$attribute}{$value} += $values->{$value};
       }
     }
   }

  my @rows;                                                                     # One row per attribute and value: count, value, attribute
  for my $attribute(sort keys %total)
   {for my $value(sort keys %{$total{$attribute}})
     {push @rows,
       [$total{$attribute}{$value}, firstNChars($value, 128), $attribute];      # Otherwise the report can get surprisingly wide
     }
   }

  my @ordered = sort                                                            # By attribute name and then by descending count within each attribute
   {$$a[2] cmp $$b[2] or $$b[0] <=> $$a[0]} @rows;

  my $columns = <<END;                                                          # Column names and descriptions
Count     The number of  times this value occurs
Value     The value being counted
Attribute The attribute on which the value appears
END

  formatTable(\@ordered, $columns,
    summarize => 1,
    title     => qq(Attribute value counts),
    head      => qq(Xref found NNNN attribute value combinations on DDDD),
    file      => fpe($xref->reports, qw(count attributeNamesAndValues txt)));
 }

sub reportValidationErrors($)                                                   #P Report the files known to have validation errors
 {my ($xref) = @_;                                                              # Cross referencer

  my @rows;                                                                     # One row per file recorded in validationErrors
  push @rows, [$_] for sort keys %{$xref->validationErrors};

  my $head = <<END;                                                             # Report heading
Xref found NNNN topics with validation errors on DDDD
END

  formatTable(\@rows, [qw(File)],
    title=>qq(Topics with validation errors),
    head=>$head,
    file=>fpe($xref->reports, qw(bad validationErrors txt)));
 }

sub checkBookMap($$)                                                            #P Check whether a bookmap is valid or not
 {my ($xref, $bookMap) = @_;                                                    # Cross referencer, bookmap

  my @errorFields =                                                             # Fields that report errors, checked in this order
   qw(parseFailed badXml1 badXml2 badTopicRefs badXRefs
      imagesMissing badConRefs missingTopicIds
      validationErrors);

  for my $href($bookMap, sort keys %{$xref->topicRefs->{$bookMap}})             # The bookmap itself and then each topic ref in the bookmap
   {my $target = absFromAbsPlusRel($bookMap, $href);                            # File referenced
    for my $field(@errorFields)
     {next unless $xref->{$field}->{$target};                                   # No error recorded against this file in this field
      return                                                                    # First error found: field, bookmap topic id, bookmap, href, target file
       [$field, $xref->topicIds->{$bookMap}, $bookMap, $href, $target];
     }
   }

  return undef;                                                                 # No errors
 }

sub reportBookMaps($)                                                           #P Report on whether each bookmap is good or bad
 {my ($xref) = @_;                                                              # Cross referencer

  my (@bad, @good);                                                             # Rows for the bad bookmaps report, rows for the good bookmaps report
  for my $file(sort keys %{$xref->docType})                                     # Each file with a known doc type
   {next unless $xref->docType->{$file} =~ m(map\Z)s;                           # Only files whose doc type ends in map
    my $error = $xref->checkBookMap($file);                                     # First error found for the map, if any
    if ($error)
     {push @bad, $error;
     }
    else
     {push @good, [$file];
     }
   }

  $xref-> badBookMaps = [@bad];                                                 # Bad bookmaps
  $xref->goodBookMaps = [@good];                                                # Good book maps

  my $badColumns = <<END;                                                       # Column names and descriptions for the bad bookmaps report
Reason          Reason bookmap failed
Source_Topic_Id The topic id of the failing bookmap
Bookmap         Bookmap source file name
Topic_Ref       Failing appendix, chapter or topic ref.
Topic_File      Targeted topic file if known
END

  my $badHead = <<END;                                                          # Heading for the bad bookmaps report
Xref found NNNN bookmaps with errors on DDDD
END

  formatTable(\@bad, $badColumns,
    title=>qq(Bookmaps with errors),
    head=>$badHead,
    summarize=>1,
    file=>fpe($xref->reports, qw(bad bookMap txt)));

  my $goodHead = <<END;                                                         # Heading for the good bookmaps report
Xref found NNNN good bookmaps on DDDD
END

  formatTable(\@good, [qw(File)],
    title=>qq(Good bookmaps),
    head=>$goodHead,
    file=>fpe($xref->reports, qw(good bookMap txt)));
 }

sub reportTables($)                                                             #P Report on tables that have problems
 {my ($xref) = @_;                                                              # Cross referencer

  formatTable($xref->badTables, <<END,                                          # Rows come from badTables as recorded elsewhere
Reason          Reason the table failed
Attributes      The tag and attributes of the table element in question
Location        The location at which the error was detected
Source_File     The file in which the error was detected
END
    title=>qq(Tables with errors),
    head=><<END,
Xref found NNNN table errors on DDDD
END
    summarize=>1,
    file=>(fpe($xref->reports, qw(bad tables txt))));
 }

sub reportFileExtensionCount($)                                                 #P Report file extension counts
 {my ($xref) = @_;                                                              # Cross referencer

  my $counts = countFileExtensions($xref->inputFolder);                         # Result of counting file extensions in the input folder
  my $report = fpe($xref->reports, qw(count fileExtensions txt));               # Report file
  my $head   = <<END;                                                           # Report heading
Xref found NNNN different file extensions on DDDD
END

  formatTable($counts, [qw(Ext Count)],
    title=>qq(File extensions),
    head=>$head,
    file=>$report);
 }

sub reportFileTypes($)                                                          #P Report file type counts - takes too long in series
 {my ($xref) = @_;                                                              # Cross referencer

  my $counts = countFileTypes                                                   # Result of counting file types in the input folder
   ($xref->inputFolder, $xref->maximumNumberOfProcesses);
  my $report = fpe($xref->reports, qw(count fileTypes txt));                    # Report file
  my $head   = <<END;                                                           # Report heading
Xref found NNNN different file types on DDDD
END

  formatTable($counts, [qw(Type Count)],
    title=>qq(Files types),
    head=>$head,
    file=>$report);
 }

sub reportNotReferenced($)                                                      #P Report files not referenced by any of conref, image, topicref, xref and are not bookmaps.
 {my ($xref) = @_;                                                              # Cross referencer
                                                                                # The hash of unreferenced files is saved in $xref->notReferenced.

  my %files = map {$_=>1}                                                       # Locate files of interest - all files minus companion files and other control files.
    grep {m(\.\w+\Z) and !m(\.directory)}
    searchDirectoryTreesForMatchingFiles
     ($xref->inputFolder);

  my @T =                                                                       # Xref and conref targets: the good reference hashes are keyed by target file
   (keys %{$xref->{goodConRefs}},
    keys %{$xref->{goodXRefs}});
  my @i = keys %{$xref->goodImageRefs};                                         # Image files
  my @t = keys %{$xref->goodTopicRefs};                                         # Topic Refs
  my @r = map {$$_[2]} @{$xref->fixedRefs};                                     # Files whose names have been changed as a result of deguidization
  my @g = map {$$_[1]} @{$xref->fixedRefsGB};                                   # Files whose names have been changed as a result of renaming to the GB standard

  delete @files{@i, @t, @r, @g, @T};                                            # Remove referenced files

  for my $file(sort keys %{$xref->docType})                                     # Remove bookmaps from consideration as they are not usually referenced
   {my $tag = $xref->docType->{$file};
    if ($tag =~ m(\Abookmap\Z)is)
     {delete $files{$file};
     }
   }

  $xref->notReferenced = \%files;                                               # Hash of files that are not referenced

  formatTable([sort keys %files],
   [qw(FileNo Unreferenced)],
    title=>qq(Unreferenced files),
    head=><<END,
Xref found NNNN unreferenced files on DDDD.

These files are not mentioned in any conref or href attribute and are not
bookmaps.

END
    file=>(fpe($xref->reports, qw(bad notReferenced txt))));
 }

sub reportExternalXrefs($)                                                      #P Report external xrefs missing other attributes
 {my ($xref) = @_;                                                              # Cross referencer

  my @s;                                                                        # Report rows: reason, href, xref statement, source topic id, file

  # Both kinds of problem are held as {file}{href} = xref statement so they
  # are reported by the same loop with only the reason differing.
  for my $check([q(Bad scope attribute),  $xref->xrefBadScope],
                [q(Bad format attribute), $xref->xrefBadFormat])
   {my ($reason, $bad) = @$check;
    for   my $f(sort keys %$bad)                                                # Each file with a bad xref
     {my $sourceTopicId = $xref->topicIds->{$f} // q();                         # Blank if the file has no recorded topic id
      for my $h(sort keys %{$bad->{$f}})                                        # Each bad href in the file
       {push @s, [$reason, $h, $bad->{$f}{$h}, $sourceTopicId, $f];
       }
     }
   }

  formatTable(\@s, <<END,                                                       # The value returned by formatTable is the value of this sub
Reason          The reason why the xref is unsatisfactory
Href            The href attribute of the xref in question
Xref_Statement  The xref statement in question
Source_Topic_Id The topic id of the source file containing the bad external xref
File            The file containing the xref statement in question
END
    title=>qq(Bad external xrefs),
    head=>qq(Xref found bad external xrefs on DDDD),
    file=>(fpe($xref->reports, qw(bad externalXrefs txt))));
 }

sub reportPossibleImprovements($)                                               #P Report improvements possible
 {my ($xref) = @_;                                                              # Cross referencer

  # Gather the rows recorded under every key of the improvements hash, taking
  # the keys in sorted order.
  my @found = map {@{$xref->improvements->{$_}}} sort keys %{$xref->improvements};

  # Order by the fourth column first and then by the first column so that the
  # first column is the major key of the finished report.
  my @byFile = sort {$a->[3] cmp $b->[3]} @found;
  my @s      = sort {$a->[0] cmp $b->[0]} @byFile;

  formatTable(\@s, <<END,
Improvement     The improvement that might be made.
Text            The text that suggested the improvement.
Line_Number     The line number at which the improvement could be made.
File            The file in which the improvement could be made.
END
    title=>qq(Possible improvements),
    head=>qq(Xref found NNNN opportunities for improvements that might be\nmade on DDDD\n),
    file=>(fpe($xref->reports, qw(improvements txt))),
    summarize=>1);
 }

sub reportMaxZoomOut($)                                                         #P Text located via Max Zoom In
 {my ($xref) = @_;                                                              # Cross referencer
  my $names = $xref->maxZoomIn;                                                 # Names of the regular expressions searched for
  return unless $names;                                                         # Nothing to report if maxZoomIn was not specified

  my @columns = (qw(File_Name Title), sort keys %$names);                       # Column headers: file, title, then one column per name
  my %columnOf;                                                                 # Column header to column number
  @columnOf{@columns} = (0..$#columns);

  my @rows;                                                                     # One row per file that matched at least one name
  for my $file(sort keys %{$xref->maxZoomOut // {}})
   {my @row     = ($file, $xref->title->{$file});
    my @matched = sort keys %{$xref->maxZoomOut->{$file}};                      # Names that matched in this file
    for my $name(@matched)
     {$row[$columnOf{$name}] += $xref->maxZoomOut->{$file}{$name};              # Place each count under the column for its name
     }
    push @rows, [@row] if @matched;                                             # Files with no matches do not get a row
   }

  # Replace the match counts for each file with a hash holding both the title
  # of the file and the counts - this is the structure that gets dumped below.
  for my $file(sort keys %{$xref->maxZoomOut // {}})
   {$xref->maxZoomOut->{$file} =
     {title=>$xref->title->{$file}, data=>$xref->maxZoomOut->{$file}};
   }

  formatTable([sort {$a->[0] cmp $b->[0]} @rows], [@columns],                   # Rows in file name order
    title=>qq(Max Zoom In Matches),
    head=>qq(Xref found NNNN file matches on DDDD\n),
    file=>(fpe($xref->reports, qw(lists maxZoom txt))),
    summarize=>1);

  dumpFile(fpe($xref->reports, qw(lists maxZoom data)), $xref->maxZoomOut);     # Dump the search results
 }

sub reportTopicDetails($)                                                       #P Things that occur once in each file
 {my ($xref) = @_;                                                              # Cross referencer

  # One row per input file in sorted file order: outermost tag, topic id,
  # author, title and then the file itself.  A detail that has not been
  # recorded for a file is shown as an empty string.
  my @t = map
   {my $file = $_;
    [$xref->docType ->{$file} // q(),
     $xref->topicIds->{$file} // q(),
     $xref->author  ->{$file} // q(),
     $xref->title   ->{$file} // q(),
     $file]
   } sort @{$xref->inputFiles};

  formatTable(\@t, <<END,
Tag             The outermost tag
Id              The id on the outermost tag
Author          The author of the topic
Title           The title of the topic
File            The source file name as a relative file name
END
    title=>qq(Topics),
    head=>qq(Xref found NNNN topics on DDDD\n),
    file=>(fpe($xref->reports, qw(lists topics txt))),
    summarize=>1);
 }

sub reportTopicReuse($)                                                         #P Count how frequently each topic is reused
 {my ($xref) = @_;                                                              # Cross referencer

  my %sourcesOf;                                                                # {target file}{source file} = number of topic refs from source that resolve to target
  for   my $source(sort keys %{$xref->topicRefs})
   {for my $href(sort keys %{$xref->topicRefs->{$source}})
     {$sourcesOf{absFromAbsPlusRel($source, $href)}{$source}++;                 # Resolve the href against the file that contains it
     }
   }

  my @rows;                                                                     # Reuse, count, target, source
  for my $target(keys %sourcesOf)
   {my $sources = $sourcesOf{$target};
    my $reuse   = keys %$sources;                                               # Number of distinct sources referencing this target
    next unless $reuse > 1;                                                     # Eliminate targets that are referenced from only one source
    push @rows, map {[$reuse, $sources->{$_}, $target, $_]} keys %$sources;
   }

  # Order by reuse, then target, then source.  The final key on source makes
  # the order of the rows independent of hash iteration order so that the
  # report is the same from run to run.
  my $t = [sort {$a->[0] <=> $b->[0] or
                 $a->[2] cmp $b->[2] or
                 $a->[3] cmp $b->[3]} @rows];

  for my $i(1..$#$t)                                                            # Show the reuse count only on the first row for each target
   {$t->[$i][0] = '' if $t->[$i-1][2] eq $t->[$i][2];
   }

  formatTable($t,                                                               # Format report
               <<END,
Reuse           The number of times the target topic is reused over all topics
Count           The number of times the target topic is reused in the source topic
Target          The topic that is being reused == the target of reuse
Source          The topic that is referencing the reused topic
END
    title=>qq(Topic Reuses),
    head=><<END,
Xref found NNNN topics that are currently being reused on DDDD
END
    file=>(fpe($xref->reports, qw(lists topicReuse txt))),
    zero=>1,                                                                    # Reuse is very unlikely because the matching criteria is the MD5 sum
    summarize=>1);
 }

=pod

References might need fixing either because they are invalid or because we are
deguidizing

=cut

sub reportFixRefs($)                                                            #P Report of hrefs that need to be fixed
 {my ($xref) = @_;                                                              # Cross referencer

  # One row per href to be fixed: the href followed by the file containing it.
  # Files are taken in sorted order and so are the hrefs within each file.
  my @r;
  for my $source(sort keys %{$xref->fixRefs})
   {push @r, map {[$_, $source]} sort keys %{$xref->fixRefs->{$source}};
   }

  formatTable(\@r, <<END,
Href            The href that contains a reference to be fixed
Source          The topic that contains the href
END
    title=>qq(References to fix),
    head=>qq(Xref found NNNN hrefs that should be fixed on DDDD\n),
    file=>(fpe($xref->reports, qw(lists fixRefs txt))),
    zero=>1,
    summarize=>1);
 }

sub reportReferencesFromBookMaps($)                                             #P Topics and images referenced from bookmaps
 {my ($xref) = @_;                                                              # Cross referencer
  my %bi;                                                                       # Bookmap to image
  my %bt;                                                                       # Bookmap to topics
  my @bi;                                                                       # Bookmap to image report
  my @bt;                                                                       # Bookmap to topic report

  my $imageRefsFromTopic = sub                                                  # Record the images referenced from a topic against the book map that reached the topic
   {my ($bookMap, $topic) = @_;                                                 # Book map, topic

    for my $href(sort keys %{$xref->imagesReferencedFromTopics->{$topic}})      # Image href
     {my $image = absFromAbsPlusRel($topic, $href);                             # Resolve the href against the topic that contains it
      push @bi, [$href, -e $image ? 1 : '', $image, $topic, $bookMap];          # Report bookmap to image noting whether the image file exists
      $bi{$bookMap}{$image}++;                                                  # Images from bookmap
     }
   };

  for   my $bookMap(sort keys %{$xref->topicRefs})                              # Bookmap as that is the only kind of file containing a topic ref
   {for my $href(sort keys %{$xref->topicRefs->{$bookMap}})                     # Topic href
     {my $topic = absFromAbsPlusRel($bookMap, $href);

      push @bt, [$href, -e $topic ? 1 : '', $topic, $bookMap];                  # Report bookmap to topic noting whether the topic file exists
      $bt{$bookMap}{$topic}++;                                                  # Topics from bookmap

      $imageRefsFromTopic->($bookMap, $topic);
     }

    for my $conRef(sort keys %{$xref->conRefs->{$bookMap}})                     # Conref - only the images of the target are recorded, the target itself is not added to the topics report
     {my ($file) = parseDitaRef($conRef);                                       # First element returned by parseDitaRef is used as the file
      my $topic  = absFromAbsPlusRel($bookMap, $file);

      $imageRefsFromTopic->($bookMap, $topic);
     }
   }

  $xref->topicsReferencedFromBookMaps = \%bt;                                   # Topics referenced from bookmaps
  $xref->imagesReferencedFromBookMaps = \%bi;                                   # Images referenced from bookmaps

  formatTable(\@bi, <<END,                                                      # Report images
Href      The href that contains an image reference
Exists    Whether the referenced image exists or not
Image     The name of the image file
Topic     The topic that referenced the image
Bookmap   The book map that referenced the topic
END
    title=>qq(Images referenced from bookmaps),
    head=><<END,
Xref found NNNN images referenced from bookmaps via topics on DDDD
END
    file=>(fpe($xref->reports, qw(lists images_from_bookmaps txt))),
    zero=>1,
    summarize=>1);

  formatTable(\@bt, <<END,                                                      # Report topics - the value returned by this call is the value of this sub
Href      The href that contains a topic reference
Exists    Whether the referenced topic exists or not
Topic     The topic referenced by the book map
Bookmap   The book map that referenced the topic
END
    title=>qq(Topics referenced from bookmaps),
    head=><<END,
Xref found NNNN topics referenced from bookmaps via topics on DDDD
END
    file=>(fpe($xref->reports, qw(lists topics_from_bookmaps txt))),
    zero=>1,
    summarize=>1);
 }

sub reportSimilarTopicsByTitle($)                                               #P Report topics likely to be similar on the basis of their titles as expressed in the non Guid part of their file names
 {my ($xref) = @_;                                                              # Cross referencer

  my %t;                                                                        # {prefix}{short file name}++
  for   my $File(@{$xref->inputFiles})                                          # Each input file
   {my $F = fn $File;
    my $f = $F =~ s([0-9a-f]{32}\Z) (_)gsr;                                     # Replace a trailing 32 character hexadecimal md5 sum in the file name with an underscore
    $t{$f}{$F}++;
   }

  my @t;                                                                        # Similar, prefix, source
  for my $prefix(keys %t)
   {my @similar = keys %{$t{$prefix}};                                          # File names sharing this prefix
    next unless @similar > 1;                                                   # Eliminate files that have no similar counter parts
    push @t, map {[scalar(@similar), $prefix, $_]} @similar;
   }

  # Most numerous first, then by prefix, then by source.  The final key on
  # source makes the order of the rows independent of hash iteration order so
  # that the report is the same from run to run.
  my $t = [sort {$b->[0] <=> $a->[0] or
                 $a->[1] cmp $b->[1] or
                 $a->[2] cmp $b->[2]} @t];

  for my $i(1..$#$t)                                                            # Show the count only on the first row for each prefix
   {$t->[$i][0] = '' if $t->[$i-1][1] eq $t->[$i][1];
   }

  formatTable($t,                                                               # Format report
               <<END,
Similar          The number of topics similar to this one
Prefix           The prefix of the target file names being used for matching
Source           Topics that have the current prefix
END
    title=>qq(Topics with similar titles),
    head=><<END,
Xref found NNNN topics that might be similar on DDDD
END
    file=>(fpe($xref->reports, qw(lists similar byTitle txt))),
    zero=>1,
    summarize=>1);
 }

sub reportSimilarTopicsByVocabulary($)                                          #P Report topics likely to be similar on the basis of their vocabulary
 {my ($xref) = @_;                                                              # Cross referencer

  # Partition the topics using matchTopics and the vocabulary, keeping only
  # the partitions that contain more than one topic.
  my @partitions = grep {scalar(@$_) > 1}
   setPartitionOnIntersectionOverUnionOfHashStringSets
    ($xref->matchTopics, $xref->vocabulary);

  my @t;                                                                        # Report rows
  for my $partition(@partitions)
   {my @topics = @$partition;
    my $size   = @topics;                                                       # Number of topics in this block
    push @t, [$size, shift @topics];                                            # The size is shown against the first topic only
    push @t, [q(), $_] for @topics;                                             # Remaining topics in the block
    push @t, [q(), q()];                                                        # Blank row to separate the blocks
   }

  my $reportFile = fpe($xref->reports, qw(lists similar byVocabulary txt));

  formatTable(\@t, <<END,
Similar          The number of similar topics in this block
Topic            One of the similar topics
END
    title=>qq(Topics with similar vocabulary),
    head=>qq(Xref found NNNN topics that have similar vocabulary on DDDD\n),
    file=>$reportFile);
 }

sub reportMd5Sum($)                                                             #P Good files have short names which uniquely represent their content and thus can be used instead of their md5sum to generate unique names
 {my ($xref) = @_;                                                              # Cross referencer

  # Files are taken in sorted order.  Files for which no md5 sum has been
  # recorded are not reported at all.
  my @files = grep {$xref->md5Sum->{$_}} sort keys %{$xref->md5Sum};

  my %sums;                                                                     # {short file name}{md5 sum}++ - ideally there is just one md5 sum for each short file name
  for my $file(@files)
   {$sums{fn($file)}{$xref->md5Sum->{$file}}++;
   }

  my @good;                                                                     # Files whose short name has exactly one md5 sum
  my @bad;                                                                      # Files whose short name has more than one md5 sum
  for my $file(@files)
   {my $short = fn($file);
    my $row   = [$xref->md5Sum->{$file}, $short, $file];
    if (keys(%{$sums{$short}}) == 1)
     {push @good, $row;
     }
    else
     {push @bad,  $row;
     }
   }

  formatTable(\@bad, <<END,
Md5_Sum           The md5 sum in question
Short_File_Name   The short name of the file
File              The file name
END
    title=>qq(Files whose short names are not bi-jective with their md5 sums),
    head=>qq(Xref found NNNN such files on DDDD\n),
    file=>(fpe($xref->reports, qw(bad shortNameToMd5Sum txt))),
    summarize=>1);

  formatTable(\@good, <<END,
Md5_Sum           The md5 sum in question
Short_File_Name   The short name of the file
File              The file name
END
    title=>qq(Files whose short names are bi-jective with their md5 sums),
    head=>qq(Xref found NNNN such files on DDDD\n),
    file=>(fpe($xref->reports, qw(good shortNameToMd5Sum txt))),
    summarize=>1);
 }

sub reportOlBody($)                                                             #P ol under body - indicative of a task
 {my ($xref) = @_;                                                              # Cross referencer

  # Copy the olBody counts and drop every file whose recorded base tag differs
  # from the one requested.  A file with no recorded base tag is kept.
  my $select = sub
   {my ($body) = @_;                                                            # Base tag wanted
    my %selected = %{$xref->olBody};
    for my $file(keys %selected)
     {my $tag = $xref->baseTag->{$file};
      delete $selected{$file} if $tag and $tag ne $body;
     }
    %selected
   };

  my %conbody = $select->(q(conbody));
  my @conbody = sort {$conbody{$b} <=> $conbody{$a}} sort keys %conbody;        # Highest count first, file name order within equal counts

  formatTable([map {[$conbody{$_}, $_]} @conbody], <<END,
Count             Number of ol under a conbody tag
File_Name         The name of the file containing an ol under conbody
END
    title=>qq(ol under conbody indicative of task),
    head=><<END,
Xref found NNNN files with ol under a conbody tag on DDDD.

ol under a conbody tag is often indicative of steps in a task.
END
    file=>(fpe($xref->reports, qw(bad olUnderConBody txt))),
    summarize=>1);

  my %taskbody = $select->(q(taskbody));
  my @taskbody = sort {$taskbody{$b} <=> $taskbody{$a}} sort keys %taskbody;    # Highest count first, file name order within equal counts

  formatTable([map {[$taskbody{$_}, $_]} @taskbody], <<END,
Count             Number of ol under a taskbody tag
File_Name         The name of the file containing an ol under taskbody
END
    title=>qq(ol under taskbody indicative of steps),
    head=><<END,
Xref found NNNN files with ol under a taskbody tag on DDDD.

ol under a taskbody tag is often indicative of steps in a task.
END
    file=>(fpe($xref->reports, qw(bad olUnderTaskBody txt))),
    summarize=>1);
 }

sub reportHrefUrlEncoding($)                                                    #P href needs url encoding
 {my ($xref) = @_;                                                              # Cross referencer

  # One row per location: the href, its line location and the file, with the
  # files in sorted order and the locations sorted within each file.
  my @b;
  for my $file(sort keys %{$xref->hrefUrlEncoding})
   {push @b,
      map  {[$xref->hrefUrlEncoding->{$file}{$_}, $_, $file]}
      sort keys %{$xref->hrefUrlEncoding->{$file}};
   }

  formatTable([@b], <<END,
Href             Href that needs url encoding
Line_location    Line location
File_Name        The file containing the href that needs url encoding
END
    title=>qq(Hrefs that need url encoding),
    head=>qq(Xref found NNNN locations where an href needs to be url encoded on DDDD.\n),
    file=>(fpe($xref->reports, qw(bad hrefUrlEncoding txt))),
    summarize=>1);
 }

sub addNavTitlesToOneMap($$)                                                    #P Fix navtitles in one map
 {my ($xref, $file) = @_;                                                       # Xref results, file to fix
  my $changes = 0;                                                              # Number of successful changes
  my @r;                                                                        # Report rows, always five columns: reason, statement or href, title, target file, source file

  my $x = Data::Edit::Xml::new($file);                                          # Parse xml - should parse OK else otherwise how did we find out that this file needed to be fixed

  # Every row pushed below has the same five columns so that the failing rows
  # line up with the column headings of the report that is made from them:
  # a title or target that is not known is recorded as an empty string.
  $x->by(sub                                                                    # Each node
   {my ($o) = @_;
    if ($o->at(qr(\A(appendix|chapter|topicref)\Z)is))                          # Nodes that take nav titles
     {if (my $h = $o->href)                                                     # href to target
       {if ($h =~ m(\AGUID-)is)                                                 # Target by guid
         {if (my $target = $xref->guidToFile->{$h})                             # Absolute target name
           {if (my $title = $xref->title->{$target})                            # Nav title
             {$o->set(navtitle=>$title);                                        # Set nav title
              push @r, [q(set by guid), $h, $title, $target, $file];            # Record set
              ++$changes;
             }
            else                                                                # No title known for the target file
             {push @r, [q(No title for guid target), -A $o, q(), $target, $file];
             }
           }
          else                                                                  # No mapping from guid to target file
           {push @r, [q(No file for guid), -A $o, q(), q(), $file];
           }
         }
        else                                                                    # Target by file name
         {my $target = absFromAbsPlusRel($file, $h);                            # Absolute target name
          if (my $title = $xref->title->{$target})                              # Nav title
           {$o->set(navtitle=>$title);                                          # Set nav title
            push @r, [q(set), $h, $title, $target, $file];                      # Record set
            ++$changes;
           }
          else                                                                  # No title known for the target file
           {push @r, [q(No title for target), -A $o, q(), $target, $file];
           }
         }
       }
      else                                                                      # Nothing to look up without an href
       {push @r, [q(No href), -A $o, q(), q(), $file];
       }
     }
   });

  if ($changes)                                                                 # Replace xml in source file if we changed anything successfully
   {editXml($file, $file, $x);                                                  # Edit xml
   }

  \@r                                                                           # Return report of actions taken
 }

sub addNavTitlesToMaps($)                                                       #P Add nav titles to files containing maps.
 {my ($xref) = @_;                                                              # Xref results
  my @r;                                                                        # Additions made
  my @files =
    sort
    grep  {($xref->baseTag->{$_} // q()) =~ m(map\Z)s}                          # Files containing maps - a file with no recorded base tag is not a map
    keys %{$xref->baseTag};                                                     # Files with any base tags

  if (@files)                                                                   # Add nav titles to files
   {my @square = squareArray(@files);                                           # Divide the task

    my $ps = newProcessStarter($xref->maximumNumberOfProcesses);                # Process starter
       $ps->processingTitle   = q(Xref navtitles);
       $ps->totalToBeStarted  = scalar @square;
       $ps->processingLogFile = fpe($xref->reports, qw(log xref navtitles txt));

    for my $row(@square)                                                        # Each row of input files
     {$ps->start(sub
       {my @results;                                                            # Results for this row - one array of report rows per file
        for my $col(@$row)                                                      # Each column in the row
         {push @results, $xref->addNavTitlesToOneMap($col);                     # Process one input file
         }
        [@results]                                                              # Return results as a reference
       });
     }

    for my $r(deSquareArray($ps->finish))                                       # Consolidate results
     {push @r, @$r;
     }
   }

  my @bad;                                                                      # Rows whose reason starts with No
  my @good;                                                                     # All other rows, with the reason removed
  for my $r(@r)
   {if ($$r[0] =~ m(\ANo)s)
     {push @bad, $r;
     }
    else
     {shift @$r;                                                                # The good report has no reason column
      push @good, $r;
     }
   }

  formatTable($xref->badNavTitles = \@bad, <<END,                               # Report bad results
Reason         The reason why a nav title was not added
Statement      The source xml statement requesting a navtitle
Title          The title of the navtitle attribute
Target_File    The target of the href
Source_File    The source file being edited
END
    summarize=>1,
    title=>qq(Failing Nav titles),
    head=><<END,
Xref was unable to add NNNN navtitles as requested by the addNavTitles attribute on DDDD
END
    file=>(fpe($xref->reports, qw(bad navTitles txt))));

  formatTable($xref->goodNavTitles = \@good, <<END,                             # Report good results - the value returned by this call is the value of this sub
Statement      The source xml statement requesting a navtitle
Title          The title of the navtitle attribute
Target_File    The target of the href
Source_File    The source file being edited
END
    summarize=>1,
    title=>qq(Succeeding Nav titles),
    head=><<END,
Xref was able to add NNNN navtitles as requested by the addNavTitles parameter on DDDD
END
    file=>(fpe($xref->reports, qw(good navTitles txt))));
 }

sub createSampleInputFiles($)                                                   #P Create sample input files for testing. The files are always created in the folder named by the literal B<in>; the parameter is the number of numbered concept files to create.
 {my ($N) = @_;                                                                 # Number of sample files
  my $in = q(in);                                                               # Folder in which the sample files are created
  clearFolder($in, 20);                                                         # Clear the folder first - NOTE(review): 20 is presumably a limit on the number of files that may be removed, confirm against clearFolder
  for my $n(1..$N)                                                              # Numbered concepts 1..N, each one referring to the next with the last wrapping round to the first
   {my $o = $n + 1; $o -= $N if $o > $N;                                        # Number of the concept referred to by concept $n
    my $f = owf(fpe($in, $n, q(dita)), <<END);
<concept id="c$n">
  <title>Concept $n refers to $o</title>
  <conbody id="b$n">
     <xref id="x$n"  format="dita" href="$o.dita#c$o/x$o">Good</xref>
     <xref id="x$n"  format="dita" href="$o.dita#c$n/x$o">Duplicate id</xref>
     <xref id="b1$n" format="dita" href="bad$o.dita#c$o/x$o">Bad file</xref>
     <xref id="b2$n" format="dita" href="$o.dita#c$n/x$o">Bad topic id</xref>
     <xref id="b3$n" format="dita" href="$o.dita#c$o/x$n">Bad id in topic</xref>
     <xref id="g1$n" format="dita" href="$o.dita#c$o">Good 1</xref>
     <xref id="g2$n" format="dita" href="#c$o/x$o">Good 2</xref>
     <xref id="g3$n" format="dita" href="#c$o">Good 3</xref>
     <p conref="#c$n">Good conref</p>
     <p conref="#b$n">Bad conref</p>
     <image href="a$n.png"/>
     <image href="b$n.png"/>
     <ol><li/><li/></ol>
  </conbody>
</concept>
END
   }

  # act1: a concept with id guid-000, an author, images referenced by guid and
  # by file name, an xref with no href and two ol lists under conbody.
  owf(fpe($in, qw(act1 dita)), <<END);
<concept id="guid-000">
  <title id="title">All Timing Codes Begin Here</title>
  <author>Phil</author>
  <conbody>
    <p>Note: see below</p>
    <p>Important: ignore all notes above</p>
    <image href="guid-000"/>
    <image href="guid-act1"/>
    <image href="guid-9999"/>
    <image href="act1.png"/>
    <xref/>
     <ol><li/><li/></ol>
     <ol><li/><li/></ol>
  </conbody>
</concept>
END

  # act2: the only numbered/act file here with an xml declaration and DOCTYPE;
  # it holds a mixture of xrefs, links and conrefs by file name and by guid.
  owf(fpe($in, qw(act2 dita)), <<END);
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
<concept id="c2">
  <title id="title">Jumping Through Hops</title>
  <conbody>
    <section>
      <title/>
      <xref  format="dita" href="act1.dita#c1/title">All Timing Codes Begin Here</xref>
      <note  conref="act2.dita#c2/title"/>
      <xref  format="dita" href="9999#c1/title"/>
      <xref  format="dita" href="guid-000#guid-000/title"/>
      <xref  format="dita" href="guid-001#guid-001/title guid-000#guid-000/title"/>
      <xref  format="dita" href="guid-000#guid-000/title2"/>
      <xref  format="dita" href="guid-000#c1/title2"/>
      <xref  format="dita" href="guid-999#c1/title2"/>
      <xref  href="http://"/>
      <image href="act2.png"/>
      <link href="guid-000"/>
      <link href="guid-999"/>
      <link href="act1.dita"/>
      <link href="act9999.dita"/>
      <p conref="9999.dita"/>
      <p conref="bookmap.ditamap"/>
      <p conref="bookmap2.ditamap"/>
    </section>
    <required-cleanup>PLEX18</required-cleanup>
  </conbody>
</concept>
<!--linted: 2018-Nov-23 -->
END

  # act3: opens conbody but closes body so the tags do not match - presumably
  # this supplies the file that fails to parse, confirm against the tests.
  owf(fpe($in, qw(act3 dita)), <<END);
<concept id="c3">
  <title>Jumping Through Hops</title>
  <conbody>
    <p/>
  </body>
</concept>
END

  # act4 and act5: concepts that contain a taskbody and differ only in their id
  owf(fpe($in, qw(act4 dita)), <<END);
<concept id="c4">
  <taskbody/>
</concept>
END

  owf(fpe($in, qw(act5 dita)), <<END);
<concept id="c5">
  <taskbody/>
</concept>
END

  # table: two tables each declaring cols="1" while their rows hold two
  # entries, plus an image whose href contains a space.
  owf(fpe($in, qw(table dita)), <<END);
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd" []>
<concept id="table">
  <title>Tables</title>
  <conbody>
    <image href="new pass.png"/>
    <table>
      <tgroup cols="1">
        <thead>
          <row>
            <entry>
              <p>Significant Event</p>
            </entry>
            <entry>
              <p>Audit Event</p>
            </entry>
          </row>
        </thead>
        <tbody>
          <row>
            <entry/>
          </row>
        </tbody>
      </tgroup>
    </table>
    <table>
      <tgroup cols="1">
        <colspec/>
        <colspec/>
        <thead>
          <row>
            <entry>aaaa</entry>
            <entry>bbbb</entry>
          </row>
        </thead>
        <tbody>
          <row>
            <entry>aaaa</entry>
            <entry>bbbb</entry>
          </row>
          <row>
            <entry>aaaa</entry>
            <entry>bbbb</entry>
          </row>
        </tbody>
      </tgroup>
    </table>
  </conbody>
</concept>
END

  # bookmap and bookmap2: maps with the same topicrefs under a chapter, the
  # chapter href and the title being the only differences between them.
  owf(fpe($in, qw(map bookmap ditamap)), <<END);
<map id="m1">
  <title>Test</title>
  <chapter  href="yyyy.dita">
    <topicref href="../act1.dita">Interesting topic</topicref>
    <topicref href="../act2.dita"/>
    <topicref href="../map/r.txt"/>
    <topicref href="9999.dita"/>
    <topicref href="bbb.txt"/>
    <topicref href="guid-000"/>
    <topicref href="guid-888"/>
    <topicref href="guid-999"/>
  </chapter>
</map>
END
  owf(fpe($in, qw(map bookmap2 ditamap)), <<END);
<map id="m2">
  <title>Test 2</title>
  <chapter  href="zzzz.dita">
    <topicref href="../act1.dita">Interesting topic</topicref>
    <topicref href="../act2.dita"/>
    <topicref href="../map/r.txt"/>
    <topicref href="9999.dita"/>
    <topicref href="bbb.txt"/>
    <topicref href="guid-000"/>
    <topicref href="guid-888"/>
    <topicref href="guid-999"/>
  </chapter>
</map>
END
  # bookmap3: chapters referring to act3, act4 and act5; it reuses the id m2
  owf(fpe($in, qw(map bookmap3 ditamap)), <<END);
<map id="m2">
  <title>Test 3</title>
  <chapter  href="../act3.dita"/>
  <chapter  href="../act4.dita"/>
  <chapter  href="../act5.dita"/>
</map>
END
  createEmptyFile(fpe($in, qw(a1 png)));                                        # The only image file created by this sub; every other image referenced above has no file
 }

sub createSampleInputFilesFixFolder($)                                          #P Create sample input files for testing fixFolder
 {my ($in) = @_;                                                                # Folder to create the files in

  # Concept 1: conrefs and xrefs into 2.dita, which is created below, and into
  # 3.dita, which is not created by this sub.
  my $referrer = <<END;
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE reference PUBLIC "-//PHIL//DTD DITA Task//EN" "concept.dtd" []>
<concept id="c1">
  <title>Concept 1 which refers to concept 2</title>
  <conbody>
     <p conref="2.dita#c2/p1"/>
     <p conref="2.dita#c2/p2"/>
     <p conref="3.dita#c2/p1"/>
     <xref href="2.dita#c2/p1"/>
     <xref href="2.dita#c2/p2"/>
     <xref href="3.dita#c2/p1"/>
  </conbody>
</concept>
END

  # Concept 2: the paragraphs p1 and p2 that concept 1 refers to
  my $referenced = <<END;
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE reference PUBLIC "-//PHIL//DTD DITA Task//EN" "concept.dtd" []>
<concept id="c2">
  <title>Concept 2 which does not refer to anything</title>
  <conbody>
     <p id="p1">Para 1 &lt;hello&gt; aaa &lt;goodbye&gt;</p>
     <p id="p2">Para 2 &lt;hello&gt; bbb &lt;goodbye&gt;</p>
  </conbody>
</concept>
END

  owf(fpe($in, 1, q(dita)), $referrer);
  owf(fpe($in, 2, q(dita)), $referenced);                                       # The value returned by this call is the value of this sub
 }

sub createSampleInputFilesLtGt($)                                               #P Create sample input files for testing items between &lt; and &gt;
 {my ($in) = @_;                                                                # Folder to create the files in

  # A single concept whose paragraph holds text between &lt; and &gt; entities
  my $concept = <<END;
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Task//EN" "concept.dtd" []>
<concept id="c1">
  <title>Concept 1 which refers to concept 2</title>
  <conbody>
     <p>&lt;aaa&gt; AAAA &lt;bbb&gt;</p>
  </conbody>
</concept>
END

  owf(fpe($in, 1, q(dita)), $concept);                                          # The value returned by owf is the value of this sub
 }

sub stripInputFolderFromHashKeys($$)                                            #P Create a duplicate of a hash but with the specified file prefix removed from each key
 {my ($hash, $prefix) = @_;                                                     # Hash to strip, file to remove from each key
  my %stripped;                                                                 # The duplicate - the values are copied as they are, only the keys are changed
  for my $key(keys %$hash)
   {$stripped{swapFilePrefix($key, $prefix)} = $hash->{$key};                   # New key is the result of swapFilePrefix on the old key
   }
  \%stripped                                                                    # Return a reference to the duplicate leaving the original hash unchanged
 }

sub stripInputFolderFromHashKeys2($$)                                           #P Create a duplicate of a hash of hashes but with the specified file prefix removed from each key at the first and second levels
 {my ($hash, $prefix) = @_;                                                     # Hash to strip, file to remove from each key
  my $stripped = stripInputFolderFromHashKeys($hash, $prefix);                  # Strip the first level keys
  for my $inner(values %$stripped)                                              # Each value is replaced in place by a stripped duplicate of itself
   {$inner = stripInputFolderFromHashKeys($inner, $prefix);                     # Strip the second level keys
   }
  $stripped                                                                     # Return a reference to the duplicate
 }

#D
# podDocumentation
=pod

=encoding utf-8

=head1 Name

Data::Edit::Xml::Xref - Cross reference Dita XML, match topics and ameliorate missing references.

=head1 Synopsis

Check the references in a large corpus of Dita XML documents held in folder
L<inputFolder|/inputFolder> running processes in parallel wherever possible
to take advantage of multi-cpu computers:

  use Data::Edit::Xml::Xref;

  my $x = xref(inputFolder              => q(in),
               maximumNumberOfProcesses => 512,
               relativePath             => q(out),
               fixBadRefs               => 1,
               flattenFolder            => q(out2),
               matchTopics              => 0.9,
              );

The cross reference analysis can be requested as a status line:

  ok nws($x->statusLine) eq nws(<<END);
Xref: 108 references fixed, 50 bad xrefs, 16 missing image files, 16 missing image references, 13 bad first lines, 13 bad second lines, 9 bad conrefs, 9 duplicate topic ids, 9 files with bad conrefs, 9 files with bad xrefs, 8 duplicate ids, 6 bad topicrefs, 6 files not referenced, 4 invalid guid hrefs, 2 bad book maps, 2 bad tables, 1 External xrefs with no format=html, 1 External xrefs with no scope=external, 1 file failed to parse, 1 href missing
END

Or as a tabular report:

  ok nws($x->statusTable) eq nws(<<END);
Xref:
    Count  Condition
 1    108  references fixed
 2     50  bad xrefs
 3     16  missing image files
 4     16  missing image references
 5     13  bad first lines
 6     13  bad second lines
 7      9  files with bad conrefs
 8      9  bad conrefs
 9      9  files with bad xrefs
10      9  duplicate topic ids
11      8  duplicate ids
12      6  bad topicrefs
13      6  files not referenced
14      4  invalid guid hrefs
15      2  bad book maps
16      2  bad tables
17      1  href missing
18      1  file failed to parse
19      1  External xrefs with no format=html
20      1  External xrefs with no scope=external
END

More detailed reports are produced in the L<reports|/reports> folder:

  $x->reports

and indexed by the reports report:

  reports/reports.txt

which contains a list of all the reports generated:

    Rows  Title                                                           File
 1     5  Attributes                                                      reports/count/attributes.txt
 2    13  Bad Xml line 1                                                  reports/bad/xmlLine1.txt
 3    13  Bad Xml line 2                                                  reports/bad/xmlLine2.txt
 4     9  Bad conRefs                                                     reports/bad/ConRefs.txt
 5     2  Bad external xrefs                                              reports/bad/externalXrefs.txt
 6    16  Bad image references                                            reports/bad/imageRefs.txt
 7     9  Bad topicrefs                                                   reports/bad/topicRefs.txt
 8    50  Bad xRefs                                                       reports/bad/XRefs.txt
 9     2  Bookmaps with errors                                            reports/bad/bookMap.txt
10     2  Document types                                                  reports/count/docTypes.txt
11     8  Duplicate id definitions within files                           reports/bad/idDefinitionsDuplicated.txt
12     3  Duplicate topic id definitions                                  reports/bad/topicIdDefinitionsDuplicated.txt
13     3  File extensions                                                 reports/count/fileExtensions.txt
14     1  Files failed to parse                                           reports/bad/parseFailed.txt
15     0  Files types                                                     reports/count/fileTypes.txt
16    16  Files whose short names are bi-jective with their md5 sums      reports/good/shortNameToMd5Sum.txt
17     0  Files whose short names are not bi-jective with their md5 sums  reports/bad/shortNameToMd5Sum.txt
18   108  Fixes Applied To Failing References                             reports/lists/referencesFixed.txt
19     0  Good bookmaps                                                   reports/good/bookMap.txt
20     9  Good conRefs                                                    reports/good/ConRefs.txt
21     5  Good topicrefs                                                  reports/good/topicRefs.txt
22     8  Good xRefs                                                      reports/good/XRefs.txt
23     1  Guid topic definitions                                          reports/lists/guidsToFiles.txt
24     2  Image files                                                     reports/good/imagesFound.txt
25     1  Missing hrefs                                                   reports/bad/missingHrefAttributes.txt
26    16  Missing image references                                        reports/bad/imagesMissing.txt
27     4  Possible improvements                                           reports/improvements.txt
28     2  Resolved GUID hrefs                                             reports/good/guidHrefs.txt
29     2  Tables with errors                                              reports/bad/tables.txt
30    23  Tags                                                            reports/count/tags.txt
31    11  Topic Reuses                                                    reports/lists/topicReuse.txt
32     0  Topic Reuses                                                    reports/lists/similar/byTitle.txt
33    16  Topics                                                          reports/lists/topics.txt
34    15  Topics with similar vocabulary                                  reports/lists/similar/byVocabulary.txt
35     0  Topics with validation errors                                   reports/bad/validationErrors.txt
36     0  Topics without ids                                              reports/bad/topicIdDefinitionsMissing.txt
37     6  Unreferenced files                                              reports/bad/notReferenced.txt
38    11  Unresolved GUID hrefs                                           reports/bad/guidHrefs.txt

File names in reports can be made relative to a specified directory named on
the:

  relativePath => q(out)

attribute.

=head2 Add navigation titles to topic references

Xref will create or update the navigation titles B<navtitles> of topic refs
B<appendix|chapter|topicref> in maps if requested by both file name and GUID
reference:

  addNavTitles => 1

Reports of successful updates will be written to:

  reports/good/navTitles.txt

Reports of unsuccessful updates will be written to:

  reports/bad/navTitles.txt

=head2 Fix bad references

It is often desirable to ameliorate unresolved Dita href attributes so that
incomplete content can be loaded into a content management system.  The:

  fixBadRefs => 1

attribute requests that the:

 conref and href

attributes be renamed to:

 xtrf

if the B<conref> or B<href> attribute specification cannot be resolved in the
current corpus.

If the L<fixedFolder|/fixedFolder> attribute is set, the fixed files are
written into this folder, else they are written back into the
L<inputFolder|/inputFolder>.  Two reports are generated by this action:


  reports/bad/fixedRefs.txt

  reports/bad/fixedRefsNoAction.txt

This feature was designed by L<mailto:mim@cpan.org>.

=head2 Deguidize

Some content management systems use guids, some content management systems use
file names as their means of identifying content. When moving from a guid to a
file name content management system it might be necessary to replace the guids
representing file names with the actual underlying file names.  If the

  deguidize =>1

parameter is set to true, Xref will replace any such file guids with the
underlying file name if it is present in the content being cross referenced.

=head2 File flattening

It is often desirable to flatten the topic files so that they can coexist in a
single folder of a content management system without colliding with each other.

The presence of the input attribute:

 flattenFolder=> folder-to-flatten-files-into

causes topic files to be flattened into the named folder.

Xref uses the L<GBStandard> to generate flattened file names.

=head2 Locating relocated files

File references in conref/hrefs that have a valid base file name and an invalid
path can be fixed by setting the input attribute:

 fixRelocatedRefs=>1

to a true value to request Xref to replace the incorrect path to the specified
base file  with the correct path.

If coded in conjunction with the B<fixBadRefs> input attribute this will cause
Xref to first try to fix any missing xrefs; any that still fail to resolve
will then be ameliorated by moving them to the B<xtrf> attribute.

=over

=item The first letter of the root tag of the topic.

=item The title of the topic with all runs of characters not in the ranges:

  a-z, A-Z, 0-9

reduced to a single underscore.

=item The MD5 sum in hexadecimal of the content of the topic.

=back

This has the effect of sorting files by their root tags and titles while
guaranteeing a unique name for the topic that depends only on its content.

If the content of two such files is identical then they will have an identical
file name because the generation of the file name depends only on the content
of the topic. If two topic files have the same name under this naming system
then they have identical content and only one file is needed to hold the topic
in a content management system.

=head2 Fix Xrefs by Title

Xrefs with broken or missing B<href>s can sometimes be fixed by matching the
text content of the B<xref> with the titles of topics.  If:

  fixXrefsByTitle => 1

is specified, L<Xref> will locate possible targets for a broken B<href> by
matching the white space normalized L<Data::Table::Text::nws> of the text
content of the B<xref> with the similarly normalized title of each topic.  If a
single matching candidate is located then it will be used to update the B<href>
attribute of the B<xref>.

=head2 Topic Matching

Topics can be matched on title and vocabulary to assist authors in finding
similar topics by specifying the:

  matchTopics => 0.9

attribute where the value of this attribute is the confidence level between 0
and 1.

Topic matching might take some time for large input folders.

=head3 Title matching

Title matching sorts topics by their titles so that topics with similar titles
can be easily located:

    Similar  Prefix        Source
 1       14  c_Notices__   c_Notices_5614e96c7a3eaf3dfefc4a455398361b
 2           c_Notices__   c_Notices_14a9f467215dea879d417de884c21e6d
 3           c_Notices__   c_Notices_19011759a2f768d76581dc3bba170a44
 4           c_Notices__   c_Notices_aa741e6223e6cf8bc1a5ebdcf0ba867c
 5           c_Notices__   c_Notices_f0009b28c3c273094efded5fac32b83f
 6           c_Notices__   c_Notices_b1480ac1af812da3945239271c579bb1
 7           c_Notices__   c_Notices_5f3aa15d024f0b6068bd8072d4942f6d
 8           c_Notices__   c_Notices_17c1f39e8d70c765e1fbb6c495bedb03
 9           c_Notices__   c_Notices_7ea35477554f979b3045feb369b69359
10           c_Notices__   c_Notices_4f200259663703065d247b35d5500e0e
11           c_Notices__   c_Notices_e3f2eb03c23491c5e96b08424322e423
12           c_Notices__   c_Notices_06b7e9b0329740fc2b50fedfecbc5a94
13           c_Notices__   c_Notices_550a0d84dfc94982343f58f84d1c11c2
14           c_Notices__   c_Notices_fa7e563d8153668db9ed098d0fe6357b
15        3  c_Overview__  c_Overview_f9e554ee9be499368841260344815f58
16           c_Overview__  c_Overview_f234dc10ea3f4229d0e1ab4ad5e8f5fe
17           c_Overview__  c_Overview_96121d7bcd41cf8be318b96da0049e73


=head3 Vocabulary matching

Vocabulary matching compares the vocabulary of pairs of topics: topics with
similar vocabularies within the confidence level specified are reported
together:

    Similar  Topic
 1        8  in/1.dita
 2           in/2.dita
 3           in/3.dita
 4           in/4.dita
 5           in/5.dita
 6           in/6.dita
 7           in/7.dita
 8           in/8.dita
 9
10        2  in/map/bookmap.ditamap
11           in/map/bookmap2.ditamap
12
13        2  in/act4. dita
14           in/act5.dita

=head1 Description

Cross reference Dita XML, match topics and ameliorate missing references.


Version 20190524.


The following sections describe the methods in each functional area of this
module.  For an alphabetic listing of all methods by name see L<Index|/Index>.



=head1 Cross reference

Check the cross references in a set of Dita files and report the results.

=head2 xref(%)

Check the cross references in a set of Dita files held in  L<inputFolder|/inputFolder> and report the results in the L<reports|/reports> folder. The possible attributes are defined in L<Data::Edit::Xml::Xref|/Data::Edit::Xml::Xref>

     Parameter    Description
  1  %attributes  Attributes

B<Example:>


  if (1) {
    clearFolder($_, 420) for qw(in out reports);
    createSampleInputFiles(8);
    my $x = 𝘅𝗿𝗲𝗳(inputFolder              => q(in),
                 deguidize                => 1,
                 fixBadRefs               => 1,
                 fixRelocatedRefs         => 1,
                 maximumNumberOfProcesses => 2,
                 matchTopics              => 0.9,
                 flattenFolder            => q(out),
                 relativePath             => q(in));

    ok nws($x->statusLine) eq nws(<<'END');
  Xref: 103 xtfr, 50 bad xrefs, 18 missing image files, 18 missing image references, 14 bad first lines, 14 bad second lines, 12 duplicate topic ids, 11 bad conrefs, 9 files with bad conrefs, 9 files with bad xrefs, 8 duplicate ids, 6 bad topicrefs, 4 invalid guid hrefs, 3 bad book maps, 3 files not referenced, 2 bad tables, 2 href url encoding, 1 External xrefs with no format=html, 1 External xrefs with no scope=external, 1 file failed to parse, 1 href missing
  END

    say STDERR $x->statusTable;

    is_deeply $x->relocatedReferencesFixed,
  [["map/bookmap.ditamap",
    "bookmap.ditamap",
    "/home/phil/perl/cpan/DataEditXmlXref/lib/Data/Edit/Xml/in/act2.dita",
   ],
   ["map/bookmap2.ditamap",
    "bookmap2.ditamap",
    "/home/phil/perl/cpan/DataEditXmlXref/lib/Data/Edit/Xml/in/act2.dita",
   ],
  ];
   }



=head2 Data::Edit::Xml::Xref Definition


Attributes used by the Xref cross referencer.




=head3 Input fields


B<debugTimes> - Write timing information if true

B<deguidize> - Set true to replace guids in dita references with file name. Given reference g1#g2/id convert g1 to a file name by locating the topic with topicId g2.  This requires the guids to be genuinely unique. SDL guids are thought to be unique by language code but the same topic, translated to a different language might well have the same guid as the original topic with a different language code: =(de|en|es|fr).  If the source is in just one language then the guid uniqueness is a reasonable assumption.  If the conversion can be done in phases by language then the uniqueness of guids is again reasonably assured. L<Data::Edit::Xml::Lint> provides an alternative solution to deguidizing by using labels to record the dita reference in the input corpus for each id encountered, these references can then be resolved in the usual manner by L<Data::Edit::Xml::Lint::relint>.

B<fixBadRefs> - Try to fix bad references in L<these files|/fixRefs> where possible by either changing a guid to a file name assuming the right file is present in the corpus being scanned and L<deguidize|/deguidize> has been set true or failing that by moving the failing reference to the "xtrf" attribute.

B<fixRelocatedRefs> - Fix references to topics that have been moved around in the out folder structure assuming that all file names are unique.

B<fixXrefsByTitle> - Try to fix invalid xrefs by the Gearhart Title Method if true

B<flattenFolder> - Files are renamed to the Gearhart standard and placed in this folder if set.  References to the unflattened files are updated to references to the flattened files.  This option will eventually be deprecated as the Dita::GB::Standard is now fully available allowing files to be easily flattened before being processed by Xref.

B<inputFolder> - A folder containing the dita and ditamap files to be cross referenced.

B<matchTopics> - Match topics by title and by vocabulary to the specified confidence level between 0 and 1.  This operation might take some time to complete on a large corpus.

B<maxZoomIn> - Optional hash of names to regular expressions to look for in each file

B<maximumNumberOfProcesses> - Maximum number of processes to run in parallel at any one time.

B<relativePath> - Report files relative to this path or absolutely if undefined.

B<reports> - Reports folder: the cross referencer will write reports to files in this folder.

B<summary> - Print the summary line.



=head3 Output fields


B<addNavTitles> - If true, add navtitle to topicrefs to show the title of the target

B<attributeCount> - {file}{attribute name} == count of the different xml attributes found in the xml files.

B<attributeNamesAndValuesCount> - {file}{attribute name}{value} = count

B<author> - {file} = author of this file.

B<badBookMaps> - Bad book maps.

B<badConRefs> - {sourceFile} = [file, href] indicating the file has at least one bad conref.

B<badConRefsList> - Bad conrefs - by file.

B<badGuidHrefs> - Bad conrefs - all.

B<badImageRefs> - Consolidated images missing.

B<badNavTitles> - Details of nav titles that were not resolved

B<badTables> - Array of tables that need fixing.

B<badTopicRefs> - [file, href]   Invalid href attributes found on topicref tags.

B<badXRefs> - Bad Xrefs - by file

B<badXRefsList> - Bad Xrefs - all

B<badXml1> - [Files] with a bad xml encoding header on the first line.

B<badXml2> - [Files] with a bad xml doc type on the second line.

B<baseTag> - Base Tag for each file

B<conRefs> - {file}{href}   Count of conref definitions in each file.

B<docType> - {file} == docType:  the docType for each xml file.

B<duplicateIds> - [file, id]     Duplicate id definitions within each file.

B<duplicateTopicIds> - [topicId, [files]] Files with duplicate topic ids - the id on the outermost tag.

B<fileExtensions> - Default file extensions to load

B<fixRefs> - {file}{ref} where the href or conref target is not valid.

B<fixedFolder> - Fixed files are placed in this folder if L<fixBadRefs|/fixBadRefs> has been specified.

B<fixedRefs> - [] hrefs and conrefs from L<fixRefs|/fixRefs> which were invalid but have been fixed by L<deguidizing|/deguidize> them to a valid file name.

B<fixedRefsFailed> - [] hrefs and conrefs from L<fixRefs|/fixRefs> which were moved to the "xtrf" attribute as requested by the L<fixBadRefs|/fixBadRefs> attribute because the reference was invalid and could not be improved by L<deguidization|/deguidize>.

B<fixedRefsGB> - [] files fixed to the Gearhart-Brenan file naming standard

B<fixedRefsNoAction> - [] hrefs and conrefs from L<fixRefs|/fixRefs> for which no action was taken.

B<flattenFiles> - {old full file name} = file renamed to Gearhart-Brenan file naming standard

B<goodBookMaps> - Good book maps.

B<goodConRefs> - Good con refs - by file.

B<goodConRefsList> - Good con refs - all.

B<goodGuidHrefs> - {file}{href}{location}++ where a href that starts with GUID- has been correctly resolved.

B<goodImageRefs> - Consolidated images found.

B<goodNavTitles> - Details of nav titles that were resolved

B<goodTopicRefs> - Good topic refs.

B<goodXRefs> - Good xrefs - by file.

B<goodXRefsList> - Good xrefs - all.

B<guidHrefs> - {file}{href} = location where href starts with GUID- and is thus probably a guid.

B<guidToFile> - {topic id which is a guid} = file defining topic id.

B<hrefUrlEncoding> - Hrefs that need url encoding because they contain white space

B<ids> - {file}{id}     Id definitions across all files.

B<images> - {file}{href}   Count of image references in each file.

B<imagesReferencedFromBookMaps> - {bookmap full file name}{full name of image referenced from topic referenced from bookmap}++

B<imagesReferencedFromTopics> - {topic full file name}{full name of image referenced from topic}++

B<improvements> - Suggested improvements - a list of improvements that might be made.

B<inputFiles> - Input files from L<inputFolder|/inputFolder>.

B<inputFolderImages> - {full image file name} for all files in input folder thus including any images present

B<ltgt> - {text between &lt; and &gt;}{filename} = count giving the count of text items found between &lt; and &gt;

B<maxZoomOut> - Results from L<maxZoomIn|/maxZoomIn>  where {file name}{regular expression key name in L<maxZoomIn|/maxZoomIn>}++

B<md5Sum> - MD5 sum for each input file.

B<missingImageFiles> - [file, href] == Missing images in each file.

B<missingTopicIds> - Missing topic ids.

B<noHref> - Tags that should have an href but do not have one.

B<notReferenced> - {file name} Files in input area that are not referenced by a conref, image, topicref or xref tag and are not a bookmap.

B<olBody> - The number of ol under body by file

B<parseFailed> - {file} files that failed to parse.

B<relocatedReferencesFailed> - Failing references that were not fixed by relocation

B<relocatedReferencesFixed> - Relocated references fixed

B<results> - Summary of results table.

B<sourceFile> - The source file from which this structure was generated.

B<statusLine> - Status line summarizing the cross reference.

B<statusTable> - Status table summarizing the cross reference.

B<tagCount> - {file}{tags} == count of the different tag names found in the xml files.

B<title> - {file} = title of file.

B<titleToFile> - {title}{file}++ if L<fixXrefsByTitle> is in effect

B<topicIds> - {file} = topic id - the id on the outermost tag.

B<topicRefs> - {bookmap full file name}{href}{navTitle}++ References from bookmaps to topics via appendix, chapter, topicref.

B<topicsReferencedFromBookMaps> - {bookmap file, file name}{topic full file name}++

B<validationErrors> - True means that Lint detected errors in the xml contained in the file.

B<vocabulary> - The text of each topic shorn of attributes for vocabulary comparison.

B<xRefs> - {file}{href}++ Xrefs references.

B<xrefBadFormat> - External xrefs with no format=html.

B<xrefBadScope> - External xrefs with no scope=external.



=head1 Attributes


The following is a list of all the attributes in this package.  A method coded
with the same name in your package will over ride the method of the same name
in this package and thus provide your value for the attribute in place of the
default value supplied for this attribute by this package.

=head2 Replaceable Attribute List


improvementLength


=head2 improvementLength

Improvement length




=head1 Private Methods

=head2 countLevels($$)

Count hash elements to the specified number of levels

     Parameter  Description
  1  $l         Levels
  2  $h         Hash

=head2 loadInputFiles($)

Load the names of the files to be processed

     Parameter  Description
  1  $xref      Cross referencer

=head2 analyzeOneFile($$)

Analyze one input file

     Parameter  Description
  1  $Xref      Xref request
  2  $iFile     File to analyze

=head2 reportGuidsToFiles($)

Map and report guids to files

     Parameter  Description
  1  $xref      Xref results

=head2 editXml($$$)

Edit an xml file

     Parameter  Description
  1  $in        Input file
  2  $out       Output file
  3  $x         Parse tree

=head2 fixOneFile($$)

Fix one file by moving unresolved references to the xtrf attribute

     Parameter  Description
  1  $xref      Xref results
  2  $file      File to fix

=head2 fixFiles($)

Fix files by moving unresolved references to the xtrf attribute if no other solution is available

     Parameter  Description
  1  $xref      Xref results

=head2 fixOneFileGB($$)

Fix one file to the Gearhart-Brenan standard

     Parameter  Description
  1  $xref      Xref results
  2  $file      File to fix

=head2 fixFilesGB($)

Rename files to the Gearhart-Brenan standard

     Parameter  Description
  1  $xref      Xref results

=head2 analyze($)

Analyze the input files

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportDuplicateIds($)

Report duplicate ids

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportDuplicateTopicIds($)

Report duplicate topic ids

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportNoHrefs($)

Report locations where an href was expected but not found

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportRefs($$)

Report bad references found in xrefs or conrefs as they have the same structure

     Parameter  Description
  1  $xref      Cross referencer
  2  $type      Type of reference to be processed

=head2 reportGuidHrefs($)

Report on guid hrefs

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportXrefs($)

Report bad xrefs

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportTopicRefs($)

Report topic refs

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportConrefs($)

Report bad conrefs refs

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportImages($)

Reports on images and references to images

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportParseFailed($)

Report failed parses

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportXml1($)

Report bad xml on line 1

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportXml2($)

Report bad xml on line 2

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportDocTypeCount($)

Report doc type count

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportTagCount($)

Report tag counts

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportLtGt($)

Report items found between &lt; and &gt;

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportAttributeCount($)

Report attribute counts

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportAttributeNamesAndValuesCount($)

Report attribute value counts

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportValidationErrors($)

Report the files known to have validation errors

     Parameter  Description
  1  $xref      Cross referencer

=head2 checkBookMap($$)

Check whether a bookmap is valid or not

     Parameter  Description
  1  $xref      Cross referencer
  2  $bookMap   Bookmap

=head2 reportBookMaps($)

Report on whether each bookmap is good or bad

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportTables($)

Report on tables that have problems

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportFileExtensionCount($)

Report file extension counts

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportFileTypes($)

Report file type counts - takes too long in series

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportNotReferenced($)

Report files not referenced by any of conref, image, topicref, xref and are not bookmaps.

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportExternalXrefs($)

Report external xrefs missing other attributes

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportPossibleImprovements($)

Report improvements possible

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportMaxZoomOut($)

Text located via Max Zoom In

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportTopicDetails($)

Things that occur once in each file

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportTopicReuse($)

Count how frequently each topic is reused

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportFixRefs($)

Report of hrefs that need to be fixed

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportReferencesFromBookMaps($)

Topics and images referenced from bookmaps

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportSimilarTopicsByTitle($)

Report topics likely to be similar on the basis of their titles as expressed in the non Guid part of their file names

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportSimilarTopicsByVocabulary($)

Report topics likely to be similar on the basis of their vocabulary

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportMd5Sum($)

Good files have short names which uniquely represent their content and thus can be used instead of their md5sum to generate unique names

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportOlBody($)

ol under body - indicative of a task

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportHrefUrlEncoding($)

href needs url encoding

     Parameter  Description
  1  $xref      Cross referencer

=head2 addNavTitlesToOneMap($$)

Fix navtitles in one map

     Parameter  Description
  1  $xref      Xref results
  2  $file      File to fix

=head2 addNavTitlesToMaps($)

Add nav titles to files containing maps.

     Parameter  Description
  1  $xref      Xref results

=head2 createSampleInputFiles($)

Create sample input files for testing. The attribute B<inputFolder> supplies the name of the folder in which to create the sample files.

     Parameter  Description
  1  $N         Number of sample files

=head2 createSampleInputFilesFixFolder($)

Create sample input files for testing fixFolder

     Parameter  Description
  1  $in        Folder to create the files in

=head2 createSampleInputFilesLtGt($)

Create sample input files for testing items between &lt; and &gt;

     Parameter  Description
  1  $in        Folder to create the files in


=head1 Index


1 L<addNavTitlesToMaps|/addNavTitlesToMaps> - Add nav titles to files containing maps.

2 L<addNavTitlesToOneMap|/addNavTitlesToOneMap> - Fix navtitles in one map

3 L<analyze|/analyze> - Analyze the input files

4 L<analyzeOneFile|/analyzeOneFile> - Analyze one input file

5 L<checkBookMap|/checkBookMap> - Check whether a bookmap is valid or not

6 L<countLevels|/countLevels> - Count hash elements to the specified number of levels

7 L<createSampleInputFiles|/createSampleInputFiles> - Create sample input files for testing.

8 L<createSampleInputFilesFixFolder|/createSampleInputFilesFixFolder> - Create sample input files for testing fixFolder

9 L<createSampleInputFilesLtGt|/createSampleInputFilesLtGt> - Create sample input files for testing items between &lt; and &gt;

10 L<editXml|/editXml> - Edit an xml file

11 L<fixFiles|/fixFiles> - Fix files by moving unresolved references to the xtrf attribute if no other solution is available

12 L<fixFilesGB|/fixFilesGB> - Rename files to the Gearhart-Brenan standard

13 L<fixOneFile|/fixOneFile> - Fix one file by moving unresolved references to the xtrf attribute

14 L<fixOneFileGB|/fixOneFileGB> - Fix one file to the Gearhart-Brenan standard

15 L<loadInputFiles|/loadInputFiles> - Load the names of the files to be processed

16 L<reportAttributeCount|/reportAttributeCount> - Report attribute counts

17 L<reportAttributeNamesAndValuesCount|/reportAttributeNamesAndValuesCount> - Report attribute value counts

18 L<reportBookMaps|/reportBookMaps> - Report on whether each bookmap is good or bad

19 L<reportConrefs|/reportConrefs> - Report bad conrefs refs

20 L<reportDocTypeCount|/reportDocTypeCount> - Report doc type count

21 L<reportDuplicateIds|/reportDuplicateIds> - Report duplicate ids

22 L<reportDuplicateTopicIds|/reportDuplicateTopicIds> - Report duplicate topic ids

23 L<reportExternalXrefs|/reportExternalXrefs> - Report external xrefs missing other attributes

24 L<reportFileExtensionCount|/reportFileExtensionCount> - Report file extension counts

25 L<reportFileTypes|/reportFileTypes> - Report file type counts - takes too long in series

26 L<reportFixRefs|/reportFixRefs> - Report of hrefs that need to be fixed

27 L<reportGuidHrefs|/reportGuidHrefs> - Report on guid hrefs

28 L<reportGuidsToFiles|/reportGuidsToFiles> - Map and report guids to files

29 L<reportHrefUrlEncoding|/reportHrefUrlEncoding> - href needs url encoding

30 L<reportImages|/reportImages> - Reports on images and references to images

31 L<reportLtGt|/reportLtGt> - Report items found between &lt; and &gt;

32 L<reportMaxZoomOut|/reportMaxZoomOut> - Text located via Max Zoom In

33 L<reportMd5Sum|/reportMd5Sum> - Good files have short names which uniquely represent their content and thus can be used instead of their md5sum to generate unique names

34 L<reportNoHrefs|/reportNoHrefs> - Report locations where an href was expected but not found

35 L<reportNotReferenced|/reportNotReferenced> - Report files not referenced by any of conref, image, topicref, xref and are not bookmaps.

36 L<reportOlBody|/reportOlBody> - ol under body - indicative of a task

37 L<reportParseFailed|/reportParseFailed> - Report failed parses

38 L<reportPossibleImprovements|/reportPossibleImprovements> - Report improvements possible

39 L<reportReferencesFromBookMaps|/reportReferencesFromBookMaps> - Topics and images referenced from bookmaps

40 L<reportRefs|/reportRefs> - Report bad references found in xrefs or conrefs as they have the same structure

41 L<reportSimilarTopicsByTitle|/reportSimilarTopicsByTitle> - Report topics likely to be similar on the basis of their titles as expressed in the non Guid part of their file names

42 L<reportSimilarTopicsByVocabulary|/reportSimilarTopicsByVocabulary> - Report topics likely to be similar on the basis of their vocabulary

43 L<reportTables|/reportTables> - Report on tables that have problems

44 L<reportTagCount|/reportTagCount> - Report tag counts

45 L<reportTopicDetails|/reportTopicDetails> - Things that occur once in each file

46 L<reportTopicRefs|/reportTopicRefs> - Report topic refs

47 L<reportTopicReuse|/reportTopicReuse> - Count how frequently each topic is reused

48 L<reportValidationErrors|/reportValidationErrors> - Report the files known to have validation errors

49 L<reportXml1|/reportXml1> - Report bad xml on line 1

50 L<reportXml2|/reportXml2> - Report bad xml on line 2

51 L<reportXrefs|/reportXrefs> - Report bad xrefs

52 L<xref|/xref> - Check the cross references in a set of Dita files held in  L<inputFolder|/inputFolder> and report the results in the L<reports|/reports> folder.

=head1 Installation

This module is written in 100% Pure Perl and, thus, it is easy to read,
comprehend, use, modify and install via B<cpan>:

  sudo cpan install Data::Edit::Xml::Xref

=head1 Author

L<philiprbrenan@gmail.com|mailto:philiprbrenan@gmail.com>

L<http://www.appaapps.com|http://www.appaapps.com>

=head1 Copyright

Copyright (c) 2016-2019 Philip R Brenan.

This module is free software. It may be used, redistributed and/or modified
under the same terms as Perl itself.

=cut



# Tests and documentation

sub test                                                                        # Run the tests held in the DATA section of this package if there are any
 {my $p = __PACKAGE__;                                                          # Name of this package - used to address its DATA file handle
  binmode($_, ":utf8") for *STDOUT, *STDERR;                                    # Apply the utf8 layer to both standard output streams
  return if eval "eof(${p}::DATA)";                                             # Nothing to run if the DATA handle reports end of file
  my $s = eval "join('', <${p}::DATA>)";                                        # Read everything remaining on the DATA handle into one string
  $@ and die $@;                                                                # Propagate any error raised while reading
  eval $s;                                                                      # Execute the text read from DATA as Perl source from within this sub
  $@ and die $@;                                                                # Propagate any error raised by the executed text
  1                                                                             # Return true once the text has run without dying
 }

test unless caller;                                                             # Run the tests only when this file is executed directly rather than loaded by another file

1;                                                                              # True value returned to whatever loads this file
# podDocumentation
__DATA__
use Test::More;
use warnings FATAL=>qw(all);
use strict;

if ($^O =~ m(linux)i)
 {plan tests => 17;
 }
else
 {plan skip_all => 'Only Linux is supported';
 }

Test::More->builder->output("/dev/null")                                        # Show only errors during testing
  if ((caller(1))[0]//'Data::Edit::Xml::Xref') eq "Data::Edit::Xml::Xref";

if (1) {                                                                        # Fix xrets by title  - there should be just one so fixed
  clearFolder($_, 420) for qw(in out reports);
  createSampleInputFiles(8);

  my $x = xref(inputFolder     => q(in),
               fixXrefsByTitle =>1);

  ok nws($x->statusLine) eq nws(<<'END');
Xref: 50 bad xrefs, 18 missing image files, 18 missing image references, 14 bad first lines, 14 bad second lines, 12 duplicate topic ids, 11 bad conrefs, 9 files with bad conrefs, 9 files with bad xrefs, 8 duplicate ids, 6 bad topicrefs, 4 files not referenced, 4 invalid guid hrefs, 3 bad book maps, 2 bad tables, 2 href url encoding, 1 External xrefs with no format=html, 1 External xrefs with no scope=external, 1 file failed to parse, 1 href missing
END

  my $y = xref(inputFolder     => q(in));                                       # Update error counts

  ok nws($y->statusLine) eq nws(<<'END');
Xref: 49 bad xrefs, 18 missing image files, 18 missing image references, 14 bad first lines, 14 bad second lines, 12 duplicate topic ids, 11 bad conrefs, 9 files with bad conrefs, 9 files with bad xrefs, 8 duplicate ids, 6 bad topicrefs, 4 files not referenced, 4 invalid guid hrefs, 3 bad book maps, 2 bad tables, 2 href url encoding, 1 External xrefs with no format=html, 1 External xrefs with no scope=external, 1 file failed to parse, 1 href missing
END
 }

if (1)
 {clearFolder($_, 420) for qw(in out reports);
  createSampleInputFiles(8);

  my $x = xref(inputFolder              => q(in),
               maximumNumberOfProcesses => 2);

  ok nws($x->statusLine) eq nws(<<'END');
Xref: 50 bad xrefs, 18 missing image files, 18 missing image references, 14 bad first lines, 14 bad second lines, 12 duplicate topic ids, 11 bad conrefs, 9 files with bad conrefs, 9 files with bad xrefs, 8 duplicate ids, 6 bad topicrefs, 4 files not referenced, 4 invalid guid hrefs, 3 bad book maps, 2 bad tables, 2 href url encoding, 1 External xrefs with no format=html, 1 External xrefs with no scope=external, 1 file failed to parse, 1 href missing
END


  is_deeply stripInputFolderFromHashKeys2($x->topicsReferencedFromBookMaps,
                                          fpd($x->inputFolder)),
{
  "act2.dita"            => { "act1.dita" => 1, "act9999.dita" => 1 },
  "map/bookmap.ditamap"  => {
                               "act1.dita"     => 1,
                               "act2.dita"     => 1,
                               "map/9999.dita" => 1,
                               "map/bbb.txt"   => 1,
                               "map/r.txt"     => 1,
                               "map/yyyy.dita" => 1,
                             },
  "map/bookmap2.ditamap" => {
                               "act1.dita"     => 1,
                               "act2.dita"     => 1,
                               "map/9999.dita" => 1,
                               "map/bbb.txt"   => 1,
                               "map/r.txt"     => 1,
                               "map/zzzz.dita" => 1,
                             },
  "map/bookmap3.ditamap" => { "act3.dita" => 1, "act4.dita" => 1, "act5.dita" => 1 },
};

  is_deeply stripInputFolderFromHashKeys2($x->imagesReferencedFromBookMaps,
                                          fpd($x->inputFolder)),
{
  "act2.dita" => {
    "act1.png"  => 1,
    "act2.png"  => 1,
    "guid-000"  => 1,
    "guid-9999" => 1,
    "guid-act1" => 1,
  },
  "map/bookmap.ditamap" => {
    "act1.png"  => 1,
    "act2.png"  => 1,
    "guid-000"  => 1,
    "guid-9999" => 1,
    "guid-act1" => 1,
  },
  "map/bookmap2.ditamap" => {
    "act1.png"  => 1,
    "act2.png"  => 1,
    "guid-000"  => 1,
    "guid-9999" => 1,
    "guid-act1" => 1,
  },
};
 }

if (1)
 {clearFolder($_, 420) for qw(in out reports);
  createSampleInputFiles(8);

  my $x = xref(inputFolder              => q(in),
               deguidize                => 1,
               fixBadRefs               => 1,
               maximumNumberOfProcesses => 2,
               matchTopics              => 0.9,
               flattenFolder            => q(out),
               relativePath             => q(in));

  ok nws($x->statusLine) eq nws(<<'END');
Xref: 105 xtfr, 50 bad xrefs, 18 missing image files, 18 missing image references, 14 bad first lines, 14 bad second lines, 12 duplicate topic ids, 11 bad conrefs, 9 files with bad conrefs, 9 files with bad xrefs, 8 duplicate ids, 6 bad topicrefs, 4 invalid guid hrefs, 3 bad book maps, 3 files not referenced, 2 bad tables, 2 href url encoding, 1 External xrefs with no format=html, 1 External xrefs with no scope=external, 1 file failed to parse, 1 href missing
END
 }

if (1) {                                                                        #Txref
  clearFolder($_, 420) for qw(in out reports);
  createSampleInputFiles(8);
  my $x = xref(inputFolder              => q(in),
               deguidize                => 1,
               fixBadRefs               => 1,
               fixRelocatedRefs         => 1,
               maximumNumberOfProcesses => 2,
               matchTopics              => 0.9,
               flattenFolder            => q(out),
               relativePath             => q(in));

  ok nws($x->statusLine) eq nws(<<'END');
Xref: 103 xtfr, 50 bad xrefs, 18 missing image files, 18 missing image references, 14 bad first lines, 14 bad second lines, 12 duplicate topic ids, 11 bad conrefs, 9 files with bad conrefs, 9 files with bad xrefs, 8 duplicate ids, 6 bad topicrefs, 4 invalid guid hrefs, 3 bad book maps, 3 files not referenced, 2 bad tables, 2 href url encoding, 1 External xrefs with no format=html, 1 External xrefs with no scope=external, 1 file failed to parse, 1 href missing
END

  say STDERR $x->statusTable;


  $$_[2] = swapFilePrefix($$_[2], fpd($x->inputFolder))
   for @{$x->relocatedReferencesFixed};

  is_deeply $x->relocatedReferencesFixed,
[["map/bookmap.ditamap",
  "bookmap.ditamap",
  "act2.dita",
 ],
 ["map/bookmap2.ditamap",
  "bookmap2.ditamap",
  "act2.dita",
 ],
];
 }

if (1)                                                                          # Add nav titles
 {my $N = 8;

  clearFolder($_, 420) for qw(in out reports);
  createSampleInputFiles($N);

  my $x = xref(inputFolder              => q(in),
               addNavTitles             => 1);

  is_deeply [map {[@$_[0..1]]} @{$x->badNavTitles }],
 [["No title for target", "chapter href=\"../act3.dita\""],
  ["No title for target", "chapter href=\"../act4.dita\""],
  ["No title for target", "chapter href=\"../act5.dita\""],
  ["No title for target", "topicref href=\"../map/r.txt\""],
  ["No title for target", "topicref href=\"9999.dita\""],
  ["No title for target", "topicref href=\"bbb.txt\""],
  ["No file for guid", "topicref href=\"guid-888\""],
  ["No file for guid", "topicref href=\"guid-999\""],
  ["No title for target", "chapter href=\"yyyy.dita\""],
  ["No title for target", "topicref href=\"../map/r.txt\""],
  ["No title for target", "topicref href=\"9999.dita\""],
  ["No title for target", "topicref href=\"bbb.txt\""],
  ["No file for guid", "topicref href=\"guid-888\""],
  ["No file for guid", "topicref href=\"guid-999\""],
  ["No title for target", "chapter href=\"zzzz.dita\""],
 ];

  is_deeply [map {[@$_[0..1]]} @{$x->goodNavTitles}],
 [["../act1.dita", "All Timing Codes Begin Here"],
  ["../act2.dita", "Jumping Through Hops"],
  ["guid-000", "All Timing Codes Begin Here"],
  ["../act1.dita", "All Timing Codes Begin Here"],
  ["../act2.dita", "Jumping Through Hops"],
  ["guid-000", "All Timing Codes Begin Here"],
 ];

  ok index(readFile(q(reports/count/attributeNamesAndValues.txt)), <<END) > 0;
Summary_of_column_Attribute
   Count  Attribute
1    100  href
2     77  id
3     20  conref
4      1  cols
5      1  format
END
 }

if (1)                                                                          # Max zoom in
 {my $N = 8;

  clearFolder($_, 420) for qw(in out reports);
  createSampleInputFiles($N);

  my $x = xref(inputFolder => q(in),
               maxZoomIn   => {bad=>q(Bad), good=>q(Good)});

  is_deeply stripInputFolderFromHashKeys($x->maxZoomOut,
                                           fpd($x->inputFolder)),
{
    "1.dita"               => { data => { bad => 3, good => 4 }, title => "Concept 1 refers to 2" },
    "2.dita"               => { data => { bad => 3, good => 4 }, title => "Concept 2 refers to 3" },
    "3.dita"               => { data => { bad => 3, good => 4 }, title => "Concept 3 refers to 4" },
    "4.dita"               => { data => { bad => 3, good => 4 }, title => "Concept 4 refers to 5" },
    "5.dita"               => { data => { bad => 3, good => 4 }, title => "Concept 5 refers to 6" },
    "6.dita"               => { data => { bad => 3, good => 4 }, title => "Concept 6 refers to 7" },
    "7.dita"               => { data => { bad => 3, good => 4 }, title => "Concept 7 refers to 8" },
    "8.dita"               => { data => { bad => 3, good => 4 }, title => "Concept 8 refers to 1" },
    "act1.dita"            => { data => {}, title => "All Timing Codes Begin Here" },
    "act2.dita"            => { data => {}, title => "Jumping Through Hops" },
    "act4.dita"            => { data => {}, title => undef },
    "act5.dita"            => { data => {}, title => undef },
    "map/bookmap.ditamap"  => { data => {}, title => "Test" },
    "map/bookmap2.ditamap" => { data => {}, title => "Test 2" },
    "map/bookmap3.ditamap" => { data => {}, title => "Test 3" },
    "table.dita"           => { data => {}, title => "Tables" },
  };
 }

if (1)                                                                          # fixFolder
 {clearFolder($_, 1e3) for qw(in out outFixed reports);
  createSampleInputFilesFixFolder(q(in));

  my $x = xref(inputFolder => q(in),
               fixBadRefs  => 1,
               fixedFolder => q(outFixed));

  ok $x->statusLine eq q(Xref: 2 bad second lines, 2 xtfr, 1 bad conref, 1 bad xref, 1 file not referenced, 1 file with bad conrefs, 1 file with bad xrefs);

  my @files = searchDirectoryTreesForMatchingFiles(qw(outFixed dita));

  ok @files == 1;
  ok nws(readFile($files[0])) eq nws(<<END);
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE reference PUBLIC "-//PHIL//DTD DITA Task//EN" "concept.dtd" []>
<concept id="c1">
  <title>Concept 1 which refers to concept 2</title>
  <conbody>
    <p conref="2.dita#c2/p1"/>
    <p conref="2.dita#c2/p2"/>
    <p xtrf="3.dita#c2/p1"/>
    <xref href="2.dita#c2/p1"/>
    <xref href="2.dita#c2/p2"/>
    <xref xtrf="3.dita#c2/p1"/>
  </conbody>
</concept>
END
 }

if (1)                                                                          # ltgt
 {clearFolder($_, 1e3) for qw(in reports);
  createSampleInputFilesLtGt(q(in));

  my $x = xref(inputFolder => q(in));
  my $r = readFile(q(reports/count/ltgt.txt));
  ok $r =~ m(1\s*1\s*aaa);
  ok $r =~ m(2\s*1\s*bbb);
 }

1
