#	txtDiff t1 t2
#
#	problem:  how similar or different are texts 1 and 2?
#		  if we assume that "similar" means similar in
#		  content, rather than in style, we can almost
#		  answer this question by a crude and brutal
#		  vocab analysis.
#
#	arguments:
#		t1, t2	arbitrary text.  it is never parsed as a Tcl
#			list, so unbalanced braces or quotes are harmless.
#
#	returns a three element list:
#		0	similarity score between 0.0 and 1.0: the number
#			of shared words divided by the number of distinct
#			words seen in either text.  bigger is more
#			similar.  0.0 when neither text has any usable
#			words (rather than a divide-by-zero error).
#		1	keyed list of counts:  l1 (words only in t1),
#			lc (shared words), l2 (words only in t2).
#		2	keyed list of the word lists themselves:
#			w1, common, w2.  each list is sorted and free of
#			duplicates.
#
#	the keyed lists are plain lists of key/value pairs, i.e. the
#	same string form that keylset builds, so keylget can read them.

proc txtDiff {t1 t2} {

#	reduce each text to its normalised vocabulary FIRST, so that
#	junk removal and suffix stripping actually influence what
#	counts as a shared word.

	set vocab1 [txtDiffVocab $t1]
	set vocab2 [txtDiffVocab $t2]

	foreach w $vocab1 {set in1($w) 1}
	foreach w $vocab2 {set in2($w) 1}

#	three way split.  the vocab lists are sorted, so the results
#	come out sorted as well.

	set w1 {}
	set common {}
	set w2 {}

	foreach w $vocab1 {
		if {[info exists in2($w)]} {
			lappend common $w
		} else {
			lappend w1 $w
		}
	}
	foreach w $vocab2 {
		if {![info exists in1($w)]} {
			lappend w2 $w
		}
	}

	set n1 [llength $w1]
	set nc [llength $common]
	set n2 [llength $w2]

	set detl [list [list w1 $w1] [list common $common] [list w2 $w2]]
	set nums [list [list l1 $n1] [list lc $nc] [list l2 $n2]]

#	a bigger number is more similar.   a smaller number is less similar.
#	with no words at all there is nothing in common:  report 0.0
#	instead of dividing by zero.

	set total [expr {$n1 + $nc + $n2}]
	if {$total == 0} {
		set cp 0.0
	} else {
		set cp [expr {$nc / double($total)}]
	}

	return [list $cp $nums $detl]

}

#	txtDiffVocab text
#
#	helper for txtDiff.  returns the sorted, duplicate free list of
#	upper cased, suffix stripped words in text, with punctuation
#	and common junk words removed.

proc txtDiffVocab {text} {

#	words too common to say anything about content.

	set junk [list A AN THE IT OF FOR TO FROM BY WITH WITHOUT AND OR WHICH ANY DO DOES HAVE HAS HAD IS WAS BE ARE WERE NOT TRUE FALSE OFF ON WHEN SET]

#	a hyphen separates words;  every other punctuation mark is
#	simply dropped.

	regsub -all -- {-} $text " " text
	regsub -all -- {[][().,':;"!?{}]} $text "" text

	set text [string toupper $text]

#	split on runs of whitespace.  the junk test is applied to the
#	word as written, before its suffix is stripped.

	set vocab {}
	foreach w [regexp -all -inline -- {\S+} $text] {
		if {[lsearch -exact $junk $w] >= 0} {continue}
		lappend vocab [txtDiffStem $w]
	}

	return [lsort -unique $vocab]
}

#	txtDiffStem word
#
#	helper for txtDiff.  strips the first matching suffix out of
#	ING, ED, S from an upper case word.  a word that consists of
#	nothing but the suffix is returned unchanged, so the result is
#	never empty.

proc txtDiffStem {word} {

	foreach suffix {ING ED S} {
		set cut [expr {[string length $word] - [string length $suffix]}]
		if {$cut > 0 && [string equal [string range $word $cut end] $suffix]} {
			return [string range $word 0 [expr {$cut - 1}]]
		}
	}

	return $word
}
