User:Neoconned/SourceWatchRefConverter

From SourceWatch
Jump to navigation Jump to search

This has now been mothballed in favour of the SourceWatch:RefConverterBot project.

  • This is a temporary home for the code. When it's a bit more finished, it will be integrated into MediaWiki:Monobook.js.
  • This code is adapted and developed from Cyde Weys's ref converter: http://en.wikipedia.org/wiki/User:Cyde/Ref_converter. There are two purposes to the rewrite:
    • Handle "traditional" SourceWatch-style referencing. This consists of a plain numbered link in the body of the article (e.g. [1]) and a corresponding citation in the External Links section. The vast majority of SW articles still use this referencing style; very few SW articles use the note/ref templates (which are what Cyde Weys's converter will convert).
    • Run in JavaScript rather than Perl. You'll therefore be able to run the converter from the Edit page when you edit an article.
  • The rewrite will take a while. Don't expect results soon.

The code

// This program converts (on MediaWiki wikis):
// *{{note}} and {{ref}} to <references/> style.
// *Traditional SourceWatch style references to <references/> style.
//
// Copyright (C) 2006 Ben "Cyde Weys" McIlwain
// Copyright (C) 2007 Neoconned (http://www.sourcewatch.org/index.php?title=User:Neoconned)
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
//
//
//

function swrcMain(fullText)
{
    // Entry point of the converter.
    //
    // fullText: the complete wiki source of the article, as a string.
    //
    // Returns nothing when {{mnb2}} is detected (after alerting the user) or
    // when the article contains no {{ref}}-style templates.  Otherwise it
    // returns whatever swrcArticleContainsSomeRefs() returns.
    //
    // The log and counters that the rest of the conversion accumulates are
    // kept together in one object so they can be handed to the next stage.
    var state = {
        // Accumulates the number of possible things that were incorrect
        // with {{ref}}/{{note}}.
        numErrors: 0,
        // Initial length of the article, before we make any changes to it.
        preLength: fullText.length,
        // These two accumulate lines of text that are output at the end.
        warnings: "",
        verbosage: ""
    };

    //{{mnb2}} is incredibly broken
    if (fullText.indexOf('mnb2') != -1)
    {
        alert("Panic, detecting {{mnb2}}, this article is most likely broken and will need manual repair.");
        return;
    }

    // Citation templates must end up inline in the article text, so any
    // {{cite ...}} template that spans several lines is joined onto one line.
    // This also touches citation templates that are not part of notes.
    // A warning is logged whenever the text was actually changed.
    var joinedText = fullText.replace(/\{\{cite [^\{\}]*?\}\}/g, function (citeTemplate) {
        return citeTemplate.replace(/\n/g, "");
    });
    if (joinedText != fullText)
    {
        state.warnings += "Detecting multiple line cite, trying to fix, make sure I don't make any mistakes.\n";
    }
    fullText = joinedText;

    // Collect the name (first parameter) of every {{ref|...}},
    // {{ref label|...}}, {{ref harv|...}}, {{ref harvard|...}},
    // {{ref num|...}} and {{mn|...}} template, in document order and
    // including duplicates.
    var refPattern = /\{\{(?:mn|ref(?:[_ ]label|[_ ]harv|[_ ]harvard|[_ ]num)?)\s*\|\s*([^\|]*?)\s*(?:\|\s*[^\|\}]*?\s*)*?\}\}/gi;
    var matches = [];
    var found;
    while ((found = refPattern.exec(fullText)) !== null)
    {
        matches.push(found[1]);
    }

    // If there are no {{ref}}s in the article then there's
    // no point in continuing.
    if (matches.length > 0)
    {
        return swrcArticleContainsSomeRefs(fullText, matches, state);
    }
}

function swrcArticleContainsSomeRefs(fullText, matches, state)
{
    // Converts every {{ref}}-style template in the article into Cite.php
    // <ref> markup, using the note texts gathered by swrcFindTheNotes().
    //
    // fullText: wiki source of the article (defaults to "").
    // matches:  ref names found in the article, duplicates included
    //           (defaults to an empty list).
    // state:    object carrying the shared log and counters (warnings,
    //           verbosage, numErrors) plus the optional switch forceNames
    //           ('on' gives singly-used refs a name too).  Created when
    //           omitted.
    //
    // Returns the state object.  On return state.finalText holds the
    // converted article, state.matches the de-duplicated lowercase ref names
    // and state.refCorrs the ref name -> note text map.
    fullText = fullText || "";
    matches = matches || [];
    state = state || {};
    state.warnings = state.warnings || "";
    state.verbosage = state.verbosage || "";
    state.numErrors = state.numErrors || 0;

    // Builds the pattern that matches a ref template whose first parameter
    // is exactly refKey.  refKey is escaped so it is matched literally.
    function refTemplate(refKey, flags)
    {
        var quoted = refKey.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
        return new RegExp("\\{\\{(?:mn|ref(?:[_ ]label|[_ ]harv|[_ ]harvard|[_ ]num)?)\\s*\\|\\s*" + quoted + "\\s*(?:\\|[^\\|\\}]*?\\s*)*?\\}\\}", flags);
    }

    // Wraps replacement text in a function so that "$&", "$1" and similar
    // sequences inside a note are inserted verbatim by String.replace.
    function verbatim(text)
    {
        return function () { return text; };
    }

    // Cite.php returns an error if the ref name is an integer value, so
    // purely numeric names are padded out with a prefix.
    function citeName(refKey)
    {
        return /^\d+$/.test(refKey) ? 'ref' + refKey : refKey;
    }

    ////// This section splits the names into three sorted lists.
    // If matches was [a,a,b,c,d,d,e], then:
    //   state.matches = [a,b,c,d,e]   (every name once, lowercased)
    //   matchesSingle = [b,c,e]       (names used exactly once)
    //   matchesMult   = [a,d]         (names used more than once)
    // Prototype-less objects are used as sets so that names such as
    // "constructor" cannot collide with inherited properties.
    var seenOnce = Object.create(null);
    var seenAgain = Object.create(null);
    var i, name;
    for (i = 0; i < matches.length; i++)
    {
        name = String(matches[i]).toLowerCase();
        if (name in seenOnce)
        {
            seenAgain[name] = name;
        }
        else
        {
            seenOnce[name] = name;
        }
    }
    var allNames = Object.keys(seenOnce).sort();
    var matchesMult = Object.keys(seenAgain).sort();
    var matchesSingle = [];
    for (i = 0; i < allNames.length; i++)
    {
        if (!(allNames[i] in seenAgain))
        {
            matchesSingle.push(allNames[i]);
        }
    }
    state.matches = allNames;
    //////

    if (matchesMult.length > 0)
    {
        state.warnings += "Detecting multiple refs with the same name, make sure I handle this correctly.\n";
    }

    //refCorrs is the hash between ref name and note text.
    state.refCorrs = Object.create(null);
    state.finalText = "";
    state.firstMatch = true;

    // Fills state.refCorrs and builds state.finalText (the article with the
    // note lines removed).
    swrcFindTheNotes(fullText, state);

    var refCorrs = state.refCorrs;
    var finalText = state.finalText;
    var hasNote, note, refName;

    //Go through and replace references that were only referenced once with a simple <ref>.
    for (i = 0; i < matchesSingle.length; i++)
    {
        name = matchesSingle[i];
        hasNote = Object.prototype.hasOwnProperty.call(refCorrs, name);
        note = hasNote ? refCorrs[name] : "";
        if (hasNote && !/^\s*$/.test(note))
        {
            if (state.forceNames == 'on')
            {
                refName = citeName(name);
                finalText = finalText.replace(refTemplate(name, "gi"), verbatim('<ref name="' + refName + '">' + note + '</ref>'));
            }
            else
            {
                finalText = finalText.replace(refTemplate(name, "gi"), verbatim('<ref>' + note + '</ref>'));
            }
            state.verbosage += 'Replacing ref "' + name + '" with full note: <ref>' + note + '</ref>\n';
        }
        else if (hasNote)
        {
            //Deal with blank notes.  We don't want to be inserting <ref></ref> into the article.
            // (The note exists but its text is blank; the ref is left in
            // place and is turned into {{citation needed}} further down.)
            state.numErrors++;
            state.warnings += 'Found a blank note, ref is "' + name + '"\n';
        }
        else
        {
            state.numErrors++;
            state.warnings += 'Ref "' + name + '" doesn\'t exist in notes.  Turning into {{citation needed}}\n';
        }
    }

    //Now we need to go through and replace references that were referenced multiple times.
    //We need to name our references now.
    for (i = 0; i < matchesMult.length; i++)
    {
        name = matchesMult[i];
        hasNote = Object.prototype.hasOwnProperty.call(refCorrs, name);
        note = hasNote ? refCorrs[name] : "";
        if (hasNote && !/^\s*$/.test(note))
        {
            refName = citeName(name);
            // The first occurrence carries the note text; every later one
            // becomes an empty named back-reference.
            finalText = finalText.replace(refTemplate(name, "i"), verbatim('<ref name="' + refName + '">' + note + '</ref>'));
            finalText = finalText.replace(refTemplate(name, "gi"), verbatim('<ref name="' + refName + '" />'));
            state.verbosage += 'Replacing multiply referenced "' + refName + '" with full notes: <ref>' + note + '</ref>\n';
        }
        else if (hasNote)
        {
            //Deal with blank notes.  We don't want to be inserting <ref></ref> into the article.
            state.numErrors++;
            state.warnings += 'Found a blank multiply referenced note, ref is "' + name + '"\n';
        }
        else
        {
            state.numErrors++;
            state.warnings += 'Multiple reference "' + name + '" doesn\'t exist in notes.  Turning into {{citation needed}}\n';
        }
    }

    //One more loop through any remaining {{ref}} tags to turn them into {{citation needed}}.
    finalText = finalText.replace(/\{\{(?:mn|ref(?:[_ ]label|[_ ]harv|[_ ]harvard|[_ ]num)?)\s*\|\s*[^\|]*?\s*(?:\|[^\|\}]*?\s*)*?\}\}/gi, "{{citation needed}}");

    //Remove excess newlines that we may have just made by deleting the content inbetween.
    if (/\n{4,}/.test(finalText))
    {
        state.warnings += "I think I have found too many consecutive newlines, I am going to remove them, make sure I did this right.\n";
        finalText = finalText.replace(/\n{4,}/g, "\n\n");
    }

    //Final sanity checks
    // Non-global patterns are used so that each check scans the whole text
    // independently of the previous one.
    if (/\{\{ref/i.test(finalText))
    {
        state.warnings += "Failing sanity check, there may still be some {{ref}}s left.\n";
    }
    if (/\{\{note/i.test(finalText))
    {
        state.warnings += "Failing sanity check, there may still be some {{note}}s left.\n";
    }
    if (/\{\{mn/i.test(finalText))
    {
        state.warnings += "Failing sanity check, there may still be some Footnote4 stuff left ({{mn}} or {{mnb}}).\n";
    }

    // The Perl CGI version printed a "Finished" status line here; this port
    // has no output stream, so the result is handed back to the caller.
    state.finalText = finalText;
    return state;
}

/*

swrcFindTheNotes function

*/

//Split the full Wiki source into discrete lines and
//process them sequentially to see if
//each line contains a {{note}} or a {{note label}}.
//If the line does contain a {{note}},
//match it up in the hash with its appropriate
//ref.  If it doesn't match, throw a warning
//and comment it out.  If it did match, remove it, and
//replace all removed {{note}}s with a single <references/>
function swrcFindTheNotes(fullText, state)
{
    // Walks the article line by line, pairing each {{note}}-style template
    // with the ref name it belongs to.
    //
    // fullText: wiki source of the article.
    // state:    shared conversion state; created when omitted.  Fields read:
    //             matches   - list of ref names to look for (default: none)
    //             smallFont - 'on' wraps the references list in a
    //                         "references-small" div
    //           Fields updated:
    //             refCorrs   - map of ref name -> note text
    //             finalText  - article text with the note lines removed
    //             firstMatch - true until the first note has been handled
    //             warnings, verbosage, numErrors - message logs and counter
    //
    // Returns the state object.
    state = state || {};
    state.matches = state.matches || [];
    state.refCorrs = state.refCorrs || Object.create(null);
    state.finalText = state.finalText || "";
    state.warnings = state.warnings || "";
    state.verbosage = state.verbosage || "";
    state.numErrors = state.numErrors || 0;
    if (state.firstMatch === undefined)
    {
        state.firstMatch = true;
    }

    // None of these depend on the current line, so they are prepared once.
    // "[^\n]*" is used instead of "." so that a trailing "\r" (CRLF text)
    // is still captured; it is trimmed off afterwards.
    var hasReferencesTag = /<references(\s*\/)?>/.test(fullText);
    var orphanNote = /\{\{(?:mnb2?|note)\s*\|\s*([^\|]*?)\s*\|?\s*\}\}\s*([^\n]*)$/i;
    var notePatterns = [];
    var j;
    for (j = 0; j < state.matches.length; j++)
    {
        // The ref name is escaped so that it is matched literally.
        var quoted = String(state.matches[j]).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
        notePatterns.push(new RegExp("\\{\\{(?:mnb2?|note(?:[_ ]label)?)\\s*\\|\\s*" + quoted + "\\s*(?:\\|\\s*[^\\{\\}]*?\\s*)*\\}\\}\\s*([^\\n]*)$", "i"));
    }

    var fullTextLines = fullText.split("\n");

    for (var i = 0; i < fullTextLines.length; i++)
    {
        var thisLine = fullTextLines[i];
        var matched = false;

        //Loop through each of the ref names to see if it matches
        //with any notes on this line.  This has O(n*m) efficiency.
        for (j = 0; j < notePatterns.length; j++)
        {
            var hit = notePatterns[j].exec(thisLine);
            if (hit === null)
            {
                continue;
            }

            var refName = state.matches[j];

            // Everything after the note template is the note text.  Drop any
            // further {{note label}} templates, then chop off leading and
            // trailing spaces.
            var thisMatch = hit[1]
                .replace(/\{\{note[_ ]label\s*[^\}\{]*?\}\}/gi, "")
                .replace(/^\s+/, "")
                .replace(/\s+$/, "");

            state.verbosage += 'Matching up ref "' + refName + '", removing from list, note is: ' + thisMatch + '\n';
            state.refCorrs[refName] = thisMatch;
            matched = true;

            //firstMatch is used to keep track of the first note
            //that has been replaced.  The first note is replaced
            //with <references /> and the rest are just deleted.
            if (state.firstMatch)
            {
                if (!hasReferencesTag)
                {
                    if (state.smallFont == "on")
                    {
                        state.finalText += '<div class="references-small"><references /></div>' + "\n";
                    }
                    else
                    {
                        state.finalText += "<references />\n";
                    }
                }
                state.firstMatch = false;
            }
        }

        // A line holding a matched note is dropped from the output.
        if (matched)
        {
            continue;
        }

        //If this line had a note with no corresponding ref, comment
        //it out and print a warning message.
        var orphan = orphanNote.exec(thisLine);
        if (orphan !== null)
        {
            state.warnings += 'Note "' + orphan[1] + '" isn\'t referenced, commenting out, link was: ' + orphan[2] + '\n';
            state.numErrors++;
            state.finalText += '<!-- Dead note "' + orphan[1] + '": ' + orphan[2] + ' -->\n';
        }
        else
        {
            state.finalText = state.finalText + thisLine + "\n";
        }
    }

    return state;
}