Put the misc. stuff in the top dir.
This commit is contained in:
		
							
								
								
									
										377
									
								
								site/ordfinder/index.org
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										377
									
								
								site/ordfinder/index.org
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,377 @@
 | 
			
		||||
#+title: Old junk code: Word finder
 | 
			
		||||
#+summary: Less than perfect C code 
 | 
			
		||||
#+license: wtfpl, unless otherwise noted
 | 
			
		||||
#+startup: showall
 | 
			
		||||
#&toc
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
* Old junk code: Word finder
 | 
			
		||||
 | 
			
		||||
#+caption: Based on [[https://commons.wikimedia.org/wiki/File:2001-91-1_Computer,_Laptop,_Pentagon_(5891422370).jpg][this]], CC BY 2.0
 | 
			
		||||
#&img;url=sadcomputer.png, float=right
 | 
			
		||||
 | 
			
		||||
If you ever get tired of looking at your own junk code, take a look at this.
 | 
			
		||||
 | 
			
		||||
In August 2008, when I was still learning to program in C, I created a program
 | 
			
		||||
"ordfinder" (eng: word finder) which, given a word and a dictionary, prints the
 | 
			
		||||
words from the dictionary which can be created from the letters from the given
 | 
			
		||||
word in any order. Incredibly, it ended up compiling and works perfectly for any
 | 
			
		||||
word whose length does not exceed 8 characters, although it is a bit slow.
 | 
			
		||||
 | 
			
		||||
But why not more than 8 characters? My view of memory might have been a bit
 | 
			
		||||
naive back then, because the first step in my algorithm is to generate and
 | 
			
		||||
store all permutations of all subsequences of the given word. That is, if the
 | 
			
		||||
string is "me", my program stores the array ={ "m", "e", "me", "em" }= in
 | 
			
		||||
memory before going on to reading the dictionary and looking for matches.
 | 
			
		||||
 | 
			
		||||
If the string is "you", the program stores ={ "y", "o", "yo", "oy", "u", "yu",
 | 
			
		||||
"uy", "ou", "uo", "you", "yuo", "oyu", "ouy", "uyo", "uoy" }=.
 | 
			
		||||
 | 
			
		||||
If the string is "computer", the program stores the 109600 permutations of the
 | 
			
		||||
subsequences of "computer".
 | 
			
		||||
 | 
			
		||||
If the string is "difficult", the length of 9 characters means that the program
 | 
			
		||||
attempts to store 986409 strings of lengths 1 to 9. That probably takes up not
 | 
			
		||||
more than 10 MB, so it shouldn't be a problem. However, my program seems to
 | 
			
		||||
store the list of words on the stack instead of on the heap, so words with length
 | 
			
		||||
9 or above cause a stack overflow to happen.
 | 
			
		||||
 | 
			
		||||
In any case, a word length of 10 would require about 100 MB, a word length of 11
 | 
			
		||||
about 1.2 GB, a word length of 12 about 15.6 GB, and a word length of 17 (like
 | 
			
		||||
"inconspicuousness") about 16.5 Petabytes (16500000 GB). That's 6.5 Petabytes
 | 
			
		||||
*more* than [[http://archive.org/web/petabox.php][what the Internet Archive uses]] to store millions of websites, books,
 | 
			
		||||
video and audio.
 | 
			
		||||
 | 
			
		||||
So perhaps neither my algorithm nor my implementation was that good.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
* The code
 | 
			
		||||
 | 
			
		||||
Note that this code doesn't actually compile, because of all the wrong
 | 
			
		||||
code. However, it did compile back in 2008, which means that either I added the
 | 
			
		||||
wrong code after I had compiled it, or I used an overfriendly compiler (I don't
 | 
			
		||||
remember which compiler it was, but it ran on Windows). I have run the old
 | 
			
		||||
executable with ~wine~, and that works.
 | 
			
		||||
 | 
			
		||||
It's not necessary to know C to laugh at this code, but it helps.
 | 
			
		||||
 | 
			
		||||
We'll start with some basic ~#include~s.
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <string.h>
 | 
			
		||||
#include <ctype.h>
 | 
			
		||||
#include <math.h>
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
So far, so good. Then the global variables with descriptive names. And let's
 | 
			
		||||
declare four strings of length 0 to be statically allocated, because we'll just
 | 
			
		||||
extend them later on...?
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
char os[0],s[0],r[0],t[0];
 | 
			
		||||
int l,c,rc,k,sk,i,ii,iii,ri;
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
The next step is to define our own version of C's standard library ~strstr~ function
 | 
			
		||||
(almost). I was used to PHP, so I wanted the same return values as PHP's
 | 
			
		||||
~strpos~.
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
int strpos (const char *haystack, const char *needle) {
 | 
			
		||||
  int i;
 | 
			
		||||
 | 
			
		||||
  if (strlen (haystack) < strlen (needle))
 | 
			
		||||
    return -1;
 | 
			
		||||
 | 
			
		||||
  for (i = 0; i <= (strlen (haystack) - strlen(needle)); i++) {
 | 
			
		||||
    if (!strncmp (&haystack[i], needle, strlen(needle)))
 | 
			
		||||
      return i;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  return -1;
 | 
			
		||||
}
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
Then it's time for the main function. We don't want to separate it into
 | 
			
		||||
auxiliary functions, because that's just ugly!
 | 
			
		||||
 | 
			
		||||
Indentation? Too much wastes too much space.
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
int main(int argc, char *argv[])
 | 
			
		||||
{
 | 
			
		||||
 if (argc>1) {
 | 
			
		||||
 strcpy(os,argv[1]);
 | 
			
		||||
 }
 | 
			
		||||
 else {
 | 
			
		||||
 printf("Indtast ord: ");
 | 
			
		||||
 gets(os);
 | 
			
		||||
 }
 | 
			
		||||
 printf("T\x91nker...\n");
 | 
			
		||||
 strcpy(s,os);
 | 
			
		||||
 for(i=0;s[i];i++) {
 | 
			
		||||
 s[i]=tolower(s[i]);
 | 
			
		||||
 }
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
Wait, what? We use ~strcpy~ to copy the string ~argv[1]~, which contains the
 | 
			
		||||
word we want to permute, into the statically allocated ~os~ with length 0? Or we
 | 
			
		||||
read a line from standard input and save it in ~os~? And almost the same for ~s~?
 | 
			
		||||
That's... not good.
 | 
			
		||||
 | 
			
		||||
At least these two lines aren't that bad.
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
 l=strlen(s);
 | 
			
		||||
 c=pow(l,l);
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
But then begins the actual permutation generation logic. I have tried to
 | 
			
		||||
re-understand it, with no success.
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
 rc=1;
 | 
			
		||||
 i=0;
 | 
			
		||||
 while (i<l-1) {
 | 
			
		||||
 rc=rc*(l-i);
 | 
			
		||||
 i++;
 | 
			
		||||
 }
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
While we're at it, why not declare two to-be-statically-allocated arrays with
 | 
			
		||||
dynamically-generated ints as lengths?
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
 int ca[l];
 | 
			
		||||
 char ra[rc][l+1];
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
And then some more assignments and ~while~ loops...
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
 ri=0;
 | 
			
		||||
 i=0;
 | 
			
		||||
 while (i<c) {
 | 
			
		||||
 k=1;
 | 
			
		||||
 ii=0;
 | 
			
		||||
 while (ii<l && k==1) {
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
This formula does something. I'm not sure what.
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
 ca[ii]=floor(i/pow(l,l-ii-1))-floor(i/pow(l,l-ii))*l;
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
More ~while~ loops, now also with ~if~ statements.
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
 iii=0;
 | 
			
		||||
 while (iii<ii) {
 | 
			
		||||
 if (ca[ii]==ca[iii]) {k=0;}
 | 
			
		||||
 iii++;
 | 
			
		||||
 }
 | 
			
		||||
 ii++;
 | 
			
		||||
 }
 | 
			
		||||
 if (k==1) {
 | 
			
		||||
 strcpy(ra[ri],"");
 | 
			
		||||
 ii=0;
 | 
			
		||||
 while (ii<l) {
 | 
			
		||||
 strncpy(t,s+ca[ii],1);
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
Let's concatenate ~t~ onto ~ra[ri]~, a string which hardly exists due to the
 | 
			
		||||
~char ra[rc][l+1];~ magic above.
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
 strcat(ra[ri],t);
 | 
			
		||||
 ii++;
 | 
			
		||||
 }
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
And why not concatenate an end-of-string mark onto a string which, if it
 | 
			
		||||
doesn't have an end-of-string mark, will make ~strcat~ fail miserably?
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
 strcat(ra[ri],"\0");
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
And then more junk.
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
 sk=1;
 | 
			
		||||
 ii=0;
 | 
			
		||||
 while (ii<ri && sk==1) {
 | 
			
		||||
 if (strcmp(ra[ri],ra[ii])==0) {sk=0;}
 | 
			
		||||
 ii++;
 | 
			
		||||
 }
 | 
			
		||||
 if (sk==1) {
 | 
			
		||||
 //printf("%s\n",ra[ri]);
 | 
			
		||||
 ri++;
 | 
			
		||||
 }
 | 
			
		||||
 }
 | 
			
		||||
 i++;
 | 
			
		||||
 }
 | 
			
		||||
 //printf("\nOrd: %s\nOrdl\x91ngde: %d\nOrdkombinationer: %d\n",os,l,ri);
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
Phew... At this point, I'm certain that ~ra~ is supposed to be an array of all
 | 
			
		||||
word permutations. So let's open our dictionary "ord.txt" and look for matches.
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
 FILE *f;
 | 
			
		||||
 char wrd[128];
 | 
			
		||||
 if (f=fopen("ord.txt","r")) {
 | 
			
		||||
 FILE *fw;
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
Everything is written both to output.txt *and* standard out. Anything else would
 | 
			
		||||
be stupid.
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
 fw=fopen("output.txt","w");
 | 
			
		||||
 printf("Ord dannet af \"%s\":\n\n",os);
 | 
			
		||||
 fprintf(fw,"Ord dannet af \"%s\":\n\n",os);
 | 
			
		||||
 int wc=0;
 | 
			
		||||
 while(!feof(f)) {
 | 
			
		||||
 if(fgets(wrd,126,f)) {
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
The words each end with a newline, so let's replace the newline with an
 | 
			
		||||
end-of-string mark.
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
 wrd[strlen(wrd)-1]=0;
 | 
			
		||||
 //printf("%s\n",wrd);
 | 
			
		||||
 k=0;
 | 
			
		||||
 ii=0;
 | 
			
		||||
 while (ii<ri && k==0) {
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
The magical core of the matching logic, using our own ~strpos~:
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
 if (strpos(ra[ii],wrd)>-1) {k=1;}
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
If ~k == 1~, something good happens. But it doesn't happen at once for some
 | 
			
		||||
reason.
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC c
 | 
			
		||||
 ii++;
 | 
			
		||||
 }
 | 
			
		||||
 if (k==1) {
 | 
			
		||||
 printf("%s\n",wrd);
 | 
			
		||||
 fprintf(fw,"%s\n",wrd);
 | 
			
		||||
 wc++;
 | 
			
		||||
 }
 | 
			
		||||
 }
 | 
			
		||||
 }
 | 
			
		||||
 printf("\nI alt %d ord\n",wc);
 | 
			
		||||
 fprintf(fw,"\nI alt %d ord",wc);
 | 
			
		||||
 fclose(fw);
 | 
			
		||||
 fclose(f);
 | 
			
		||||
 system("output.txt");
 | 
			
		||||
 }
 | 
			
		||||
 return 0;
 | 
			
		||||
}
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
And that's my pretty C code.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
* The SML equivalent
 | 
			
		||||
 | 
			
		||||
To make my inefficient algorithm a bit clearer, I have made a few SML functions
 | 
			
		||||
to do the same as above:
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC ocaml
 | 
			
		||||
open List
 | 
			
		||||
 | 
			
		||||
(* Removes an element from a list. *)
 | 
			
		||||
fun remove x (y :: ys) = if x = y
 | 
			
		||||
                         then ys
 | 
			
		||||
                         else y :: remove x ys
 | 
			
		||||
 | 
			
		||||
(* Tails of a list. Stolen from Haskell's Data.List. *)
 | 
			
		||||
fun tails [] = [[]]
 | 
			
		||||
  | tails (xxs as (_ :: xs)) = xxs :: tails xs
 | 
			
		||||
 | 
			
		||||
(* Non-empty subsequences of a list. Stolen from Haskell's Data.List. *)
 | 
			
		||||
fun nonEmptySubsequences [] = []
 | 
			
		||||
  | nonEmptySubsequences (x :: xs) =
 | 
			
		||||
    let
 | 
			
		||||
        fun f (ys, r) = ys :: (x :: ys) :: r
 | 
			
		||||
    in 
 | 
			
		||||
        [x] :: foldr f [] (nonEmptySubsequences xs)
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
(* All permutations of a list. *)
 | 
			
		||||
fun permutations [] = [[]]
 | 
			
		||||
  | permutations xs =
 | 
			
		||||
    let
 | 
			
		||||
        fun subPermutations x = map (fn ys => x :: ys) (permutations (remove x xs))
 | 
			
		||||
    in
 | 
			
		||||
        concat (map subPermutations xs)
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
(* Permutations of subsequences of a list. *)
 | 
			
		||||
fun subsequencePermutations xs = concat (map permutations (nonEmptySubsequences xs))
 | 
			
		||||
 | 
			
		||||
(* The same, but for a string. *)
 | 
			
		||||
fun stringSubsequencePermutations s = map implode (subsequencePermutations (explode s))
 | 
			
		||||
 | 
			
		||||
(* Finds words in `wordList` which match any permutation of any subsequence
 | 
			
		||||
 * of `word`. *)
 | 
			
		||||
fun findMatchingWords word wordList =
 | 
			
		||||
    let
 | 
			
		||||
        val wordPermutations = stringSubsequencePermutations word
 | 
			
		||||
    in
 | 
			
		||||
        filter (fn testWord =>
 | 
			
		||||
                   exists (fn word => word = testWord)
 | 
			
		||||
                          wordPermutations) wordList
 | 
			
		||||
    end
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
As well as some SML functions to calculate the number of permutations and bytes:
 | 
			
		||||
 | 
			
		||||
#+BEGIN_SRC ocaml
 | 
			
		||||
(* Calculates the factorial. *)
 | 
			
		||||
fun factorial 0 = 1
 | 
			
		||||
  | factorial n = n * factorial (n - 1)
 | 
			
		||||
 | 
			
		||||
(* Calculates the binomial coefficient. *)
 | 
			
		||||
fun binomc n k = factorial n div (factorial k * factorial (n - k))
 | 
			
		||||
 | 
			
		||||
(* Gives [m, m + 1, ..., n]. *)
 | 
			
		||||
fun upTo m n = if m < n
 | 
			
		||||
               then m :: upTo (m + 1) n
 | 
			
		||||
               else [m]
 | 
			
		||||
 | 
			
		||||
(* Gives the total number of word subsequence permutations for a given word
 | 
			
		||||
 * length. *)
 | 
			
		||||
fun nPermutations len = foldl op+ 0 (map (fn n => factorial n * binomc len n)
 | 
			
		||||
                                         (upTo 1 len))
 | 
			
		||||
 | 
			
		||||
(* Gives the size in bytes for storing all word subsequence permutations for a
 | 
			
		||||
 * given word length in a space-saving way: there are ~len~ arrays, each taking
 | 
			
		||||
 * up space for the pointer to the array and the permutations of subsequences of
 | 
			
		||||
 * length n where ~1 <= n <= len~ and n is unique.
 | 
			
		||||
 *)
 | 
			
		||||
fun nSize len = 8 * len + foldl op+ 0 (
 | 
			
		||||
                map (fn n => (n + 1) * factorial n * binomc len n)
 | 
			
		||||
                    (upTo 1 len))
 | 
			
		||||
#+END_SRC
 | 
			
		||||
 | 
			
		||||
* The alternative
 | 
			
		||||
 | 
			
		||||
Preprocess the dictionary into a clever data structure and don't use up all the
 | 
			
		||||
memory.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#&line
 | 
			
		||||
 | 
			
		||||
Originally published [[http://dikutal.dk/artikler/old-junk-code-word-finder][here]].
 | 
			
		||||
							
								
								
									
										
											BIN
										
									
								
								site/ordfinder/sadcomputer.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								site/ordfinder/sadcomputer.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| 
		 After Width: | Height: | Size: 33 KiB  | 
		Reference in New Issue
	
	Block a user