Home > Archive > Compression > June 2007 > OT: RFC, PRNG
You are viewing an archived Text-only version of the thread.
To view this thread in it's original format and/or if you want to reply to
this thread please [click here]
|
|
| cr88192 2007-05-27, 3:56 am |
| well, I lack much better place to ask about this.
for a project of mine, I have needed a fairly special PRNG. the point is to
have a sufficiently large amount of entropy that it generates
hopefully-unique strings.
additionally, it is important that the generated strings are different each
time the app is run, and are different between different installations.
in general, I am using a 4096-bit state.
the state is preserved in files.
the strings are 96-bit numbers encoded in base48 (seemed sane).
there are a few restrictions on name construction:
only letters, numbers, and '_' are allowed;
it is not allowed to have 2 or more '_' chars in a row ("__");
the sequence can't start with a number, and should not start with '_'.
as such, base48 made sense, since:
I only really have enough chars for this;
using only letters, it is always valid;
base52 will not save any chars.
actually, I guess the comment will be if there is any major flaw with this
algo which would limit its ability to produce hopefully unique names (or
anything else worth comment).
special points to anyone who can guess what these names are used for...
---
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <bgbasm.h>
int basm_rand_pos;
/* default initial state: first 1024 digits of PI */
u32 basm_rand_state[128]=
{
0x31415926, 0x53589793, 0x23846264, 0x33832795,
0x02884197, 0x16939937, 0x51058209, 0x74944592,
0x30781640, 0x62862089, 0x98628034, 0x82534211,
0x70679821, 0x48086513, 0x28230664, 0x70938446,
0x09550582, 0x23172535, 0x94081284, 0x81117450,
0x28410270, 0x19385211, 0x05559644, 0x62294895,
0x49303819, 0x64428810, 0x97566593, 0x34461284,
0x75648233, 0x78678316, 0x52712019, 0x09145648,
0x56692346, 0x03486104, 0x54326648, 0x21339360,
0x72602491, 0x41273724, 0x58700660, 0x63155881,
0x74881520, 0x92096282, 0x92540917, 0x15364367,
0x89259036, 0x00113305, 0x30548820, 0x46652138,
0x41469519, 0x41511609, 0x43305727, 0x03657595,
0x91953092, 0x18611738, 0x19326117, 0x93105118,
0x54807446, 0x23799627, 0x49567351, 0x88575272,
0x48912279, 0x38183011, 0x94912983, 0x36733624,
0x40656643, 0x08602139, 0x49463952, 0x24737190,
0x70217986, 0x09437027, 0x70539217, 0x17629317,
0x67523846, 0x74818467, 0x66940513, 0x20005681,
0x27145263, 0x56082778, 0x57713427, 0x57789609,
0x17363717, 0x87214684, 0x40901224, 0x95343014,
0x65495853, 0x71050792, 0x27968925, 0x89235420,
0x19956112, 0x12902196, 0x08640344, 0x18159813,
0x62977477, 0x13099605, 0x18707211, 0x34999999,
0x83729780, 0x49951059, 0x73173281, 0x60963185,
0x95024459, 0x45534690, 0x83026425, 0x22308253,
0x34468503, 0x52619311, 0x88171010, 0x00313783,
0x87528865, 0x87533208, 0x38142061, 0x71776691,
0x47303598, 0x25349042, 0x87554687, 0x31159562,
0x86388235, 0x37875937, 0x51957781, 0x85778053,
0x21712268, 0x06613001, 0x92787661, 0x11959092,
0x16420198, 0x93809525, 0x72010654, 0x85863278
};
static char *basm_base48=
"ABCDEFGHIJKLMNOPQRSTUVWX"
"abcdefghijklmnopqrstuvwx";
void BASM_InitRand()
{
static char buf[256];
FILE *fd;
long long l;
time_t t;
int i, j, k;
u32 i2;
fd=fopen("randbits.txt", "rt");
if(fd)
{
for(i=0; i<16; i++)
{
fgets(buf, 256, fd);
sscanf(buf, "%X %X %X %X %X %X %X %X",
&basm_rand_state[i*8+0],
&basm_rand_state[i*8+1],
&basm_rand_state[i*8+2],
&basm_rand_state[i*8+3],
&basm_rand_state[i*8+4],
&basm_rand_state[i*8+5],
&basm_rand_state[i*8+6],
&basm_rand_state[i*8+7]);
}
fclose(fd);
}
t=time(NULL);
l=*(u32 *)(&t);
for(i=0; i<128; i++)
{
l=l*65521+basm_rand_state[i];
basm_rand_state[i]=(l>>16)&0xFFFFFFFF;
}
basm_rand_state[0]^=(u32)l;
fd=fopen("randbits.txt", "wt");
if(fd)
{
for(i=0; i<16; i++)
{
for(j=0; j<8; j++)
fprintf(fd, "%08X ", basm_rand_state[i*8+j]);
fprintf(fd, "\n");
}
fclose(fd);
}
}
u32 basm_rand()
{
int i, j, k;
i=basm_rand_pos;
j=(i+127-61)%127;
k=(i+127-31)%127;
basm_rand_state[i]^=basm_rand_state[j];
basm_rand_state[i]^=basm_rand_state[k];
basm_rand_pos=(i+1)%127;
return(basm_rand_state[i]);
}
char *basm_rand_key12()
{
static char buf[16];
u32 i;
i=basm_rand();
buf[0]=basm_base48[(i/254803968)%48];
buf[1]=basm_base48[(i/5308416)%48];
buf[2]=basm_base48[(i/110592)%48];
buf[3]=basm_base48[(i/2304)%48];
buf[4]=basm_base48[(i/48)%48];
buf[5]=basm_base48[ i%48];
i=basm_rand();
buf[ 6]=basm_base48[(i/254803968)%48];
buf[ 7]=basm_base48[(i/5308416)%48];
buf[ 8]=basm_base48[(i/110592)%48];
buf[ 9]=basm_base48[(i/2304)%48];
buf[10]=basm_base48[(i/48)%48];
buf[11]=basm_base48[ i%48];
buf[12]=0;
return(buf);
}
char *basm_rand_key18()
{
static char buf[24];
u32 i;
i=basm_rand();
buf[0]=basm_base48[(i/254803968)%48];
buf[1]=basm_base48[(i/5308416)%48];
buf[2]=basm_base48[(i/110592)%48];
buf[3]=basm_base48[(i/2304)%48];
buf[4]=basm_base48[(i/48)%48];
buf[5]=basm_base48[ i%48];
i=basm_rand();
buf[ 6]=basm_base48[(i/254803968)%48];
buf[ 7]=basm_base48[(i/5308416)%48];
buf[ 8]=basm_base48[(i/110592)%48];
buf[ 9]=basm_base48[(i/2304)%48];
buf[10]=basm_base48[(i/48)%48];
buf[11]=basm_base48[ i%48];
i=basm_rand();
buf[12]=basm_base48[(i/254803968)%48];
buf[13]=basm_base48[(i/5308416)%48];
buf[14]=basm_base48[(i/110592)%48];
buf[15]=basm_base48[(i/2304)%48];
buf[16]=basm_base48[(i/48)%48];
buf[17]=basm_base48[ i%48];
buf[18]=0;
return(buf);
}
| |
| Josiah Carlson 2007-05-27, 6:55 pm |
| cr88192 wrote:
> well, I lack much better place to ask about this.
>
> for a project of mine, I have needed a fairly special PRNG. the point is to
> have a sufficiently large amount of entropy that it generates
> hopefully-unique strings.
>
> additionally, it is important that the generated strings are different each
> time the app is run, and are different between different installations.
>
> in general, I am using a 4096-bit state.
> the state is preserved in files.
Don't guess, just use Mersenne Twister. Note that neither Mersenne
Twister, nor your algorithm, are technically guaranteed to not produce
duplicate strings of 96 bits. Then there's the other perspective that
will tell you, "just use /dev/urandom on *nix".
In terms of your requirements (which sounds like a method for producing
variable names), generate a sequence of characters and sanitize it as
you go.
- Josiah
| |
| cr88192 2007-05-27, 6:55 pm |
|
"Josiah Carlson" <josiah.carlson@sbcglobal.net> wrote in message
news:BCk6i.9541$2v1.2979@newssvr14.news.prodigy.net...
> cr88192 wrote:
>
> Don't guess, just use Mersenne Twister. Note that neither Mersenne
> Twister, nor your algorithm, are technically guaranteed to not produce
> duplicate strings of 96 bits. Then there's the other perspective that
> will tell you, "just use /dev/urandom on *nix".
>
2 problems:
I don't like using other peoples' code if avoidable (I am fussy about this);
I don't have access to '/dev/urandom', because in part, I am currently
running on windows.
I am mostly just hoping that the probability of clashes is sufficiently low
that they are unlikely in practice (anything much beyond 1.0/2^32 would
probably be fine).
me just hoping this has sufficient entropy.
that is part of why I use a load, mutate, and save approach.
the hope is that with repeated runs the entropy will increase making it more
random.
another previous approach for comming up with random numbers (typically used
as seeds), was to do a bunch of tweaky behavior involving timing (ie: with
clock). the issue though is that this approach takes up some small amount of
time (but is at least fairly good about being random IME).
but, yeah, usually for things like this (small and simple) I write a
specialized version for each case where the need arises (and sometimes
different uses combine to form interesting occurances).
what I had hoped for was more commentary on the algo specifics, ie, any
obvious flaws, ...
> In terms of your requirements (which sounds like a method for producing
> variable names), generate a sequence of characters and sanitize it as you
> go.
>
yeah, basically.
they are for compiler-generated names, aka a gensym mechanism.
in the past, I had used a special prefix and sequential numbers, but this
wont work if the files are stored externally.
ly, schemes with a higher base, don't really save many chars.
base64 would save 2-chars over base48, but base32 would be 3-chars longer.
shorter names are better, but so is avoiding clashes, so it is a tradeoff...
> - Josiah
| |
| Ben Rudiak-Gould 2007-05-27, 9:56 pm |
| cr88192 wrote:
There's nothing special about that -- any PRNG worthy of the name satisfies
that criterion, if you ask it for enough bits (and no PRNG can possibly
satisfy it if you don't ask for enough).
PRNGs have been extensively studied, and some good ones are known, and some
of those have public domain C implementations. You should use one of them.
You shouldn't try to design your own, because it's very hard. (Actually,
your requirements are weak enough that even a weak PRNG /might/ satisfy
them, but there's still no reason not to use a good one.)
[color=darkred]
> I don't have access to '/dev/urandom', because in part, I am currently
> running on windows.
The Windows equivalent is CryptGenRandom(). You should probably use it,
because it's probably the most easily obtainable source of good seed
randomness on Windows, and you need that no matter which PRNG you use.
> I am mostly just hoping that the probability of clashes is sufficiently low
> that they are unlikely in practice (anything much beyond 1.0/2^32 would
> probably be fine).
With any decent PRNG, you'll need to generate around 2^48 96-bit numbers
before you get a clash (birthday paradox). In certain situations you can
guarantee 2^96 distinct numbers (in a pseudorandom order) before a repeat,
but that won't work if you have many installations running in parallel.
You also need at least 96 bits of /good/ entropy to seed your generator, or
else that will be the limiting factor. The entropy you get from timing
tricks is not very good.
> they are for compiler-generated names, aka a gensym mechanism.
> in the past, I had used a special prefix and sequential numbers, but this
> wont work if the files are stored externally.
It will if the prefix is random. At program startup get some random bits
from CryptGenRandom, and then simply append increasing natural numbers to
the end. This will work just as well for avoiding collisions, and you won't
need a PRNG at all.
-- Ben
| |
| Josiah Carlson 2007-05-27, 9:56 pm |
| cr88192 wrote:
> "Josiah Carlson" <josiah.carlson@sbcglobal.net> wrote in message
> news:BCk6i.9541$2v1.2979@newssvr14.news.prodigy.net...
>
> 2 problems:
> I don't like using other peoples' code if avoidable (I am fussy about this);
Mersenne Twister has been tested, verfied, etc., for about a decade.
While I can't guarantee that
> I don't have access to '/dev/urandom', because in part, I am currently
> running on windows.
Install Bochs, VMWare, or VirtualBox and one of the minimal linux/bsd
installations. Have it running all the time with a server that offers
random bytes from its /dev/urandom via a socket.
> that is part of why I use a load, mutate, and save approach.
> the hope is that with repeated runs the entropy will increase making it more
> random.
Not likely. All you are doing is mixing and re-mixing previously
existing bits. You are also using the decimal digits of pi rather than
the hex digits of pi as your hex initialization vector. I don't know if
it will matter, considering your mixing of fixed state without any
further seeding ability.
> another previous approach for comming up with random numbers (typically used
> as seeds), was to do a bunch of tweaky behavior involving timing (ie: with
> clock). the issue though is that this approach takes up some small amount of
> time (but is at least fairly good about being random IME).
That is basically what /dev/urandom does on *nix.
> what I had hoped for was more commentary on the algo specifics, ie, any
> obvious flaws, ...
You are reinventing the wheel without any guarantee that it works well,
relying on your hope that mixing the same bits over and over using a
fairly arbitrary mixing method will increase entropy. sci.crypto may be
able to offer better commentary, but I imagine they will offer
/dev/urandom, Mersenne Twister, or tell you that you should make your
own PRNG that plugs into your serial port.
>
> yeah, basically.
>
> they are for compiler-generated names, aka a gensym mechanism.
> in the past, I had used a special prefix and sequential numbers, but this
> wont work if the files are stored externally.
>
> ly, schemes with a higher base, don't really save many chars.
> base64 would save 2-chars over base48, but base32 would be 3-chars longer.
>
> shorter names are better, but so is avoiding clashes, so it is a tradeoff...
Here is a generation mechanism that is guaranteed to produce different
symbols of 15 characters on a single machine as long as the clock isn't
changed, you aren't requesting two symbols in two threads of the same
process, and you aren't generating 2**32 symbols each second. The below
uses 80-bit integers (or really Python's infinite precision integers),
but you can encode pid+count, then the time, and it will still come out
to 15 digits (allowing you to use a 64 then 32 bit integer).
import os
import time
lasttime = 0
count = 0
pid = os.getpid()
symbols = \
'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm
nopqrstuvwxyz0123456789_'
rsymbols = ''.join(reversed(symbols))
def gen_symbol():
global lasttime, count
s = [symbols, rsymbols]
t = int(time.time())
if t != lasttime:
count = 0
lasttime = t
#32 + 32 + 16 = 80 bits
value_to_pack = (t&0xffffffffL)*(2**48) + count*(2**16) + pid
count += 1
out = symbols[value_to_pack&31]
value_to_pack >>= 5
for i in xrange(14):
out += s[i&1][value_to_pack%48]
value_to_pack //= 48
return out
To guarantee that we never get two _ in a row, we alternate between
choosing encodings forwards or backwards through the symbol list. Some
outputs:
Qcdq2hWWhT3WrV9
QMir2hWWhT3WrV9
Qsos2hWWhT3WrV9
Qctt2hWWhT3WrV9
QMyu2hWWhT3WrV9
QcdS2i9WST3WrV9
QMiT2i9WST3WrV9
QsoU2i9WST3WrV9
QctV2i9WST3WrV9
QMyW2i9WST3WrV9
- Josiah
| |
| Josiah Carlson 2007-05-28, 3:56 am |
| Josiah Carlson wrote:
> cr88192 wrote:
>
> Mersenne Twister has been tested, verfied, etc., for about a decade.
> While I can't guarantee that
.... you won't run into problems with your particular use-case, I have
seen it replace linear congruency generators bit by bit over the years.
- Josiah
| |
| cr88192 2007-05-28, 3:56 am |
|
"Ben Rudiak-Gould" <br276deleteme@cam.ac.uk> wrote in message
news:f3d8ju$ddk$1@gemini.csx.cam.ac.uk...
> cr88192 wrote:
>
> There's nothing special about that -- any PRNG worthy of the name
> satisfies that criterion, if you ask it for enough bits (and no PRNG can
> possibly satisfy it if you don't ask for enough).
>
yes, but the internal entropy of the PRNG limits how unique of a string one
can get.
for example, it would be useless to ask for a 96-bit value from a typical
rand implementation (it only having a 64-bit internal state).
> PRNGs have been extensively studied, and some good ones are known, and
> some of those have public domain C implementations. You should use one of
> them. You shouldn't try to design your own, because it's very hard.
> (Actually, your requirements are weak enough that even a weak PRNG /might/
> satisfy them, but there's still no reason not to use a good one.)
>
yes, a weak one will work.
intuitively, it should be good enough.
also, the core is derived from algos that have worked fairly well for me in
practice.
>
> The Windows equivalent is CryptGenRandom(). You should probably use it,
> because it's probably the most easily obtainable source of good seed
> randomness on Windows, and you need that no matter which PRNG you use.
>
possible, but I like keeping anything OS dependent stuff hidden away.
luckily, this lib does have an os-dependent section, so I could probably put
it there...
>
> With any decent PRNG, you'll need to generate around 2^48 96-bit numbers
> before you get a clash (birthday paradox). In certain situations you can
> guarantee 2^96 distinct numbers (in a pseudorandom order) before a repeat,
> but that won't work if you have many installations running in parallel.
>
yes.
> You also need at least 96 bits of /good/ entropy to seed your generator,
> or else that will be the limiting factor. The entropy you get from timing
> tricks is not very good.
>
yes, that is a problem.
that is why I had hoped that the combination of loading/rehashing/and
storing, each time adding a little more entropy, would hopefully cause the
internal entropy to become ever larger.
>
> It will if the prefix is random. At program startup get some random bits
> from CryptGenRandom, and then simply append increasing natural numbers to
> the end. This will work just as well for avoiding collisions, and you
> won't need a PRNG at all.
>
possible thought, dunno though (could allow shorter names, such as instead
using a 64-bit prefix).
the current scheme will probably work ok though.
actually, elsewhere at one point a scheme I had used was to use 'time()' to
get an integer to be used as a prefix, and sequentially generated indices on
the end of this.
> -- Ben
| |
| cr88192 2007-05-28, 3:56 am |
|
"Josiah Carlson" <josiah.carlson@sbcglobal.net> wrote in message
news:ZJq6i.7748$4Y.2620@newssvr19.news.prodigy.net...
> cr88192 wrote:
>
> Mersenne Twister has been tested, verfied, etc., for about a decade. While
> I can't guarantee that
>
yes, but it is still someone else's code...
>
>
>
> Install Bochs, VMWare, or VirtualBox and one of the minimal linux/bsd
> installations. Have it running all the time with a server that offers
> random bytes from its /dev/urandom via a socket.
>
impractical...
>
>
> Not likely. All you are doing is mixing and re-mixing previously existing
> bits. You are also using the decimal digits of pi rather than the hex
> digits of pi as your hex initialization vector. I don't know if it will
> matter, considering your mixing of fixed state without any further seeding
> ability.
>
actually, that is just for the first run, if there is no file to load from.
otherwise, it mixes whatever was in the file, and adds in whatever bits come
from 'time()' (not much entropy, but does give some, depending on exactly
when and how often the app is run...).
someone else suggested another function with which to grab bits...
this should allow better accumulation...
alternatively, assuming the OS knows what it is doing, I could discard the
files, but I am uncertain here.
>
>
> That is basically what /dev/urandom does on *nix.
>
ok.
>
>
> You are reinventing the wheel without any guarantee that it works well,
> relying on your hope that mixing the same bits over and over using a
> fairly arbitrary mixing method will increase entropy. sci.crypto may be
> able to offer better commentary, but I imagine they will offer
> /dev/urandom, Mersenne Twister, or tell you that you should make your own
> PRNG that plugs into your serial port.
>
as noted, I do add new bits (from 'time()'), but that line may have been
missed...
>
> Here is a generation mechanism that is guaranteed to produce different
> symbols of 15 characters on a single machine as long as the clock isn't
> changed, you aren't requesting two symbols in two threads of the same
> process, and you aren't generating 2**32 symbols each second. The below
> uses 80-bit integers (or really Python's infinite precision integers), but
> you can encode pid+count, then the time, and it will still come out to 15
> digits (allowing you to use a 64 then 32 bit integer).
>
>
> import os
> import time
>
> lasttime = 0
> count = 0
> pid = os.getpid()
>
> symbols = \
> 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm
nopqrstuvwxyz0123456789_'
> rsymbols = ''.join(reversed(symbols))
>
> def gen_symbol():
> global lasttime, count
> s = [symbols, rsymbols]
> t = int(time.time())
> if t != lasttime:
> count = 0
> lasttime = t
>
> #32 + 32 + 16 = 80 bits
> value_to_pack = (t&0xffffffffL)*(2**48) + count*(2**16) + pid
> count += 1
>
> out = symbols[value_to_pack&31]
> value_to_pack >>= 5
>
> for i in xrange(14):
> out += s[i&1][value_to_pack%48]
> value_to_pack //= 48
>
> return out
>
>
> To guarantee that we never get two _ in a row, we alternate between
> choosing encodings forwards or backwards through the symbol list. Some
> outputs:
>
> Qcdq2hWWhT3WrV9
> QMir2hWWhT3WrV9
> Qsos2hWWhT3WrV9
> Qctt2hWWhT3WrV9
> QMyu2hWWhT3WrV9
> QcdS2i9WST3WrV9
> QMiT2i9WST3WrV9
> QsoU2i9WST3WrV9
> QctV2i9WST3WrV9
> QMyW2i9WST3WrV9
>
dunno...
> - Josiah
| |
| cr88192 2007-05-28, 3:56 am |
|
"Josiah Carlson" <josiah.carlson@sbcglobal.net> wrote in message
news:MUr6i.23277$YL5.22826@newssvr29.news.prodigy.net...
> Josiah Carlson wrote:
>
> ... you won't run into problems with your particular use-case, I have seen
> it replace linear congruency generators bit by bit over the years.
>
but, if I were just going to use other peoples' code, I could just blow off
this whole sub-project and use LLVM (or TinyC), or not bother writing
anything at all...
> - Josiah
| |
| cr88192 2007-05-28, 3:56 am |
|
"Josiah Carlson" <josiah.carlson@sbcglobal.net> wrote in message
news:MUr6i.23277$YL5.22826@newssvr29.news.prodigy.net...
> Josiah Carlson wrote:
>
> ... you won't run into problems with your particular use-case, I have seen
> it replace linear congruency generators bit by bit over the years.
>
more so, there are no generators of this sort in the code in question.
there is the 'rehash' algo, which is vaguely similar, but is closer to a
hash (re-shuffles the values, and mixes in any new state, basically whatever
is initially in the value, which is the value pulled from 'time').
the last state is xored back into the first value to hopefully prevent some
entropy loss.
the actual generator is different, and is not based on linear congruency.
u32 basm_rand()
{
int i, j, k;
i=basm_rand_pos;
j=(i+127-61)%127;
k=(i+127-31)%127;
basm_rand_state[i]^=basm_rand_state[j];
basm_rand_state[i]^=basm_rand_state[k];
basm_rand_pos=(i+1)%127;
return(basm_rand_state[i]);
}
of course, this was mostly just an intuition-based variation of an algo I
had seen before...
I am not sure of the period, but it doesn't really matter (since it is ran
for fairly short runs, it mostly just matters that the seed is decent
enough).
but, whatever, probably all this is not very important really.
just me passing time...
> - Josiah
| |
| Josiah Carlson 2007-05-28, 9:55 pm |
| cr88192 wrote:
> "Josiah Carlson" <josiah.carlson@sbcglobal.net> wrote in message
>
> dunno...
What don't you know about? The symbol is encoded directly from process
id, the current time in seconds, and the number of times the function
has been called in the last second. If you had a program that closed
and was restarted within 1 second that just happened to get the same
pid, you could get duplicate entries. This may or may not be an issue,
but it could easily be handled by fetching a few more bits from the
underlying processor performance counters, and/or using a few more bits
from the time (another 8 bits would get you 4ms resolution between
restarts).
- Josiah
| |
| Josiah Carlson 2007-05-28, 9:55 pm |
| cr88192 wrote:
> "Josiah Carlson" <josiah.carlson@sbcglobal.net> wrote in message
>
> but, if I were just going to use other peoples' code, I could just blow off
> this whole sub-project and use LLVM (or TinyC), or not bother writing
> anything at all...
What you do with your time is your business. I was trying to help by
offering you a known good PRNG, and an alternate symbol generation
mechanism, so that you could spend your time writing your compiler and
not your symbol generation routines.
But hey, if you want to run with "not written here" for all code that
can help you on your way, by all means, keep reimplementing it. I'll do
my best to not offer you any more 3rd party code that can help.
- Josiah
| |
| cr88192 2007-05-28, 9:55 pm |
|
"Josiah Carlson" <josiah.carlson@sbcglobal.net> wrote in message
news:PTH6i.8256$4Y.7845@newssvr19.news.prodigy.net...
> cr88192 wrote:
>
> What you do with your time is your business. I was trying to help by
> offering you a known good PRNG, and an alternate symbol generation
> mechanism, so that you could spend your time writing your compiler and not
> your symbol generation routines.
>
> But hey, if you want to run with "not written here" for all code that can
> help you on your way, by all means, keep reimplementing it. I'll do my
> best to not offer you any more 3rd party code that can help.
>
maybe.
yeah, I got bored with it (good enough), but did incorporate another little
RNG trick (the 'jerking off with the timer' approach), which should help a
little (over just calling 'time()' for new bits).
had to find again a good version of this, found eventually in some of my
older genetic-algorithm code (another place where a non-deterministic RNG is
helpful...).
oh well, other powers involved getting more of the dynamic linker/loader
working, ...
but, yeah, my project is backlogged (been going on for months), as I am
doing something not exactly as small as could be hoped.
actually, this lower-end, is analogous to LLVM, but is split into multiple
layers:
an assembler/linker (x86 and x86-64);
a compiler for a custom language (RPNIL, x86-only thus far).
RPNIL is vaguely similar to the LLVM language, but looks and works
differently (typed, psuedo-declarative, abstract stack machine).
above this:
C compiler core;
C optimizer/reducer (focuses on higher-level optimizations);
C parser;
C preprocessor.
>
> - Josiah
| |
| cr88192 2007-05-28, 9:55 pm |
|
"Josiah Carlson" <josiah.carlson@sbcglobal.net> wrote in message
news:wGH6i.23349$YL5.10218@newssvr29.news.prodigy.net...
> cr88192 wrote:
>
> What don't you know about? The symbol is encoded directly from process
> id, the current time in seconds, and the number of times the function has
> been called in the last second. If you had a program that closed and was
> restarted within 1 second that just happened to get the same pid, you
> could get duplicate entries. This may or may not be an issue, but it
> could easily be handled by fetching a few more bits from the underlying
> processor performance counters, and/or using a few more bits from the time
> (another 8 bits would get you 4ms resolution between restarts).
>
potentially, but I like my approach more...
(less complicated).
also, with 96 bits, the probability of clashes is very low, in any case
(just assuming the RNG is working anywhere near as good as assumed).
here are a few outputs (several runs of the app):
AUipRPOttkFqMPPMCv
LJPIwaKicvAvHWoPOM
DfAjWQOWdxntHcTWSD
FiGmhIEgSNMBGRjspR
BsnexRAChlbuPpPEGO
OdoHWkNFAXeLOUrvil
PvhjRKHdgcjUCrafiC
PCkhFSBimWDRMpEojx
FHOXwaGPXgluGqMbnB
GTdhjFGNJDbVOqCGgf
EodSxiPaRAWcAmgckf
KXiBONAMJQlcCpPSSM
and the contents from the current random bits file (changes each run):
E76B7AA9 710BEDFF 4E4D7DB9 E72ED011 445380DA 7FF5D34F 53E8AC35 C193FD2B
A5805BE7 A962D8AE EBE4838B B128314E CFF4474C 17FC1A43 B27E606C EB0550CB
8B7C9BAB 6F5E9CC2 1637C00E 72C9F545 3B7068BC ED2693CC AE8AB1E3 77C34968
44F7AC1F A19CBD93 45655551 4461C0E7 BF2D34B3 010D82A4 72DA59C1 9EF6850F
349E56C8 41819353 BCBC8A97 7B8BC1D2 84A374A7 AF148F0E 4CDA3D68 BC9FCAD9
BD7D25A2 0B4DF618 4C87EC13 701D30EB 9F35FBF3 A7C986CB B1FD6303 F52BA534
47A71ED1 EC076A9C 962CFB6A 2EC7FBCB 3E13CEF0 2BC79B01 0A4F6F58 D4B168C1
F25BF1F8 BE957F7A 54B8068E 0FC6CB0F DE69B20A A9D928E3 352B1F9A 02139C8A
7D64F4F9 9C100C20 E7306AAB DED5BCCF AE49AFEF 799DCF41 AF031C54 DB25A391
CC5DADF8 B47C318C 9E46879D 417B46FD 70C559A9 BE192773 03FA56AC 1B01A009
0AF1A959 0530D2C7 84EC2001 562CB84B ABAD5E14 4EECDE64 3E848A67 E0A3AF32
859B54C1 80A80186 77AF9DB7 9A6D389B 2C352F58 983B5A72 6EF91461 93C9F738
4E6467BA CFD8B58C 87D9F5A7 FFE2B265 B41CD34B 459BEB63 D7413B3D 9E6B789D
30514C07 774495FE 98FA1C88 25E1E351 AB159F99 99562B62 2F563BCD 75C07E64
981DAC1E C261A1B4 3DFC73B1 D1E6C96C 7CE748AD F720270E AC2C1FFD 09681A16
8CFCBAB2 77E42E01 27A31548 C2BB743B 0B3FA4AB FBF1ACF6 E9CE9B7D E8631F89
oh well, ly thus far I have not yet tried running any of this through any
statistical checks (or even the 'testing for compressibility' check), so
yeah...
>
> - Josiah
| |
|
| cr88192 wrote:
> u32 basm_rand()
> {
> int i, j, k;
>
> i=basm_rand_pos;
> j=(i+127-61)%127;
> k=(i+127-31)%127;
> basm_rand_state[i]^=basm_rand_state[j];
> basm_rand_state[i]^=basm_rand_state[k];
> basm_rand_pos=(i+1)%127;
>
> return(basm_rand_state[i]);
> }
The state will definitely repeat every 2147483894 generated numbers I
believe.
Here's the confusing lame program that told me so (note that xoring
twice equals no xor):
#include <stdio.h>
int state[128][128];
int main()
{
memset(state,0,sizeof(state));
unsigned int i,b,x=0,c;
for(i=0;;i++)
{
state[i%127][(i+66)%127]^=1;
state[i%127][(i+96)%127]^=1;
x-=2;
if(x<=0)
{
x=0;
for(b=0;b<127;b++)
{
for(c=0;c<127;c++)
if(state[b][c]==1)
x++;
}
if(x==0)
{
printf("%u\n",i+1);
return 0;
}
}
}
}
| |
| cr88192 2007-05-29, 6:56 pm |
|
"Haikz" <haikz@removediz.kolumbus.fi> wrote in message
news:3CY6i.171042$Y%3.161211@reader1.news.saunalahti.fi...
> cr88192 wrote:
>
> The state will definitely repeat every 2147483894 generated numbers I
> believe.
ok, if true, better may be better.
however, the bigger concern I guess is the input state, as likely at any
given time, this algo will only generate a relatively small number of values
(maybe a few hundred to a few k), so a large period is not all that needed
(though better is better).
maybe the problem is of picking 1/2 based primes, I don't know...
> Here's the confusing lame program that told me so (note that xoring
> twice equals no xor):
>
> #include <stdio.h>
>
> int state[128][128];
>
> int main()
> {
> memset(state,0,sizeof(state));
> unsigned int i,b,x=0,c;
> for(i=0;;i++)
> {
> state[i%127][(i+66)%127]^=1;
> state[i%127][(i+96)%127]^=1;
> x-=2;
> if(x<=0)
> {
> x=0;
> for(b=0;b<127;b++)
> {
> for(c=0;c<127;c++)
> if(state[b][c]==1)
> x++;
> }
> if(x==0)
> {
> printf("%u\n",i+1);
> return 0;
> }
> }
> }
> }
this algo seems confusing, and seems to work in reverse, so I am not
certain.
| |
|
| cr88192 wrote:
>
> this algo seems confusing, and seems to work in reverse, so I am not
> certain.
>
Actually, I noticed it was incorrect, and had a small bug besides that.
Running a fixed version now, but it it's awkwardly slow. (Though I
expect the period to be something too large to find with such a simple
method.)
However, the fact that xor is used (instead of addition) concerns me a
bit as it means that the state can only contain xor-sums of the values
in the initial state. Which initial state values are used in a specific
generated value is constant per stream position. This is something I
actually did verify, and wrote a small app to find which initial state
values are xored together in a particular stream position.
This is a quick verification tool, it can directly calculate the 10000th
generated number without generating the ones before it, and quite a few
initial state values are obviously irrelevant for generating it.
#include <stdio.h>
#include <time.h>
#include <stdint.h>
uint32_t origstate[128];
uint32_t basm_rand_state[128];
uint32_t basm_rand_pos=0;
uint32_t basm_rand()
{
int i, j, k;
i=basm_rand_pos;
j=(i+127-61)%127;
k=(i+127-31)%127;
basm_rand_state[i]^=basm_rand_state[j];
basm_rand_state[i]^=basm_rand_state[k];
basm_rand_pos=(i+1)%127;
return(basm_rand_state[i]);
}
int xorthing[127]=
{
1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
int main()
{
uint32_t thingy,thingy2;
int i,b;
srand(time(0));
for(i=0;i<128;i++)
basm_rand_state[i]=rand()+(rand()<<7)+(rand()<<15)+(rand()<<21);
memcpy(origstate,basm_rand_state,sizeof(
basm_rand_state));
for(i=0;i<10000;i++)
thingy=basm_rand();
thingy2=0;
for(i=0;i<127;i++)
if(xorthing[i])
thingy2^=origstate[i];
if(thingy==thingy2)
printf("they do equal");
return 0;
}
Also, here's the tool for finding which initial state values affect a
particular generated value:
#include <stdio.h>
#define num_item 10000
int state[128][128];
int main()
{
int i,b;
memset(state,0,sizeof(state));
for(i=0;i<128;i++)
state[i][i]=1;
for(i=0;i<num_item;i++)
{
for(b=0;b<127;b++)
{
state[i%127][b]^=state[(i+66)%127][b];
state[i%127][b]^=state[(i+96)%127][b];
}
}
printf("{");
for(b=0;b<127;b++)
printf("%2u,",state[(i-1)%127][b]);
printf("}\n");
return 0;
}
| |
| cr88192 2007-05-31, 3:56 am |
|
"Haikz" <haikz@removediz.kolumbus.fi> wrote in message
news:0ce7i.171395$2A2.158924@reader1.news.saunalahti.fi...
> cr88192 wrote:
>
> Actually, I noticed it was incorrect, and had a small bug besides that.
> Running a fixed version now, but it it's awkwardly slow. (Though I
> expect the period to be something too large to find with such a simple
> method.)
>
yes, ok.
well, it is probably doing fairly well then for something I largely just
pulled out of nowhere...
> However, the fact that xor is used (instead of addition) concerns me a
> bit as it means that the state can only contain xor-sums of the values
> in the initial state. Which initial state values are used in a specific
> generated value is constant per stream position. This is something I
> actually did verify, and wrote a small app to find which initial state
> values are xored together in a particular stream position.
>
another possibility would probably be multiplying the values and taking the
middle bits. less efficient, but may also work as a hash function of
sorts...
> This is a quick verification tool, it can directly calculate the 10000th
> generated number without generating the ones before it, and quite a few
> initial state values are obviously irrelevant for generating it.
>
well, seems this algo is hardly 'secure' then...
note that unless you are getting some enjoyment out of this, none of this is
really required of you.
I mostly just beat something together, and partly posted on usenet since I
was feeling lonely and didn't have much better to talk about right then
(lame or wasteful as it is, I guess I do this sometimes...).
ok, maybe I am the fool that does not hold his tongue...
well, I guess elsewhere (since then), I started debating about dynamic
compilation (on comp.lang.misc and alt.lang.asm).
basically, me feeling that traditional compilation technology (GCC, ...),
script/high-level language VMs (Python, Lisp, Smalltalk, ...), and unified
VMs (.Net and the JVM), have fundamental problems that I don't really feel
are well addressed (and that none of these technologies can resolve as they
are on their own).
the 'future' may be at the same time all, yet none, of these...
but, I am insane and off in my own world it seems, regardless of whether I
am agreed with or not.
unrelated, but maybe interesting...
> #include <stdio.h>
> #include <time.h>
> #include <stdint.h>
>
> uint32_t origstate[128];
> uint32_t basm_rand_state[128];
> uint32_t basm_rand_pos=0;
>
> uint32_t basm_rand()
> {
> int i, j, k;
>
> i=basm_rand_pos;
> j=(i+127-61)%127;
> k=(i+127-31)%127;
> basm_rand_state[i]^=basm_rand_state[j];
> basm_rand_state[i]^=basm_rand_state[k];
> basm_rand_pos=(i+1)%127;
>
> return(basm_rand_state[i]);
> }
>
> int xorthing[127]=
> {
> 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
> 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
> 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
> 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
> 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
> 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
> };
>
> int main()
> {
> uint32_t thingy,thingy2;
> int i,b;
> srand(time(0));
>
> for(i=0;i<128;i++)
> basm_rand_state[i]=rand()+(rand()<<7)+(rand()<<15)+(rand()<<21);
> memcpy(origstate,basm_rand_state,sizeof(
basm_rand_state));
>
> for(i=0;i<10000;i++)
> thingy=basm_rand();
>
> thingy2=0;
> for(i=0;i<127;i++)
> if(xorthing[i])
> thingy2^=origstate[i];
>
> if(thingy==thingy2)
> printf("they do equal");
>
> return 0;
> }
>
> Also, here's the tool for finding which initial state values affect a
> particular generated value:
>
> #include <stdio.h>
>
> #define num_item 10000
>
> int state[128][128];
>
> int main()
> {
> int i,b;
>
> memset(state,0,sizeof(state));
> for(i=0;i<128;i++)
> state[i][i]=1;
>
> for(i=0;i<num_item;i++)
> {
> for(b=0;b<127;b++)
> {
> state[i%127][b]^=state[(i+66)%127][b];
> state[i%127][b]^=state[(i+96)%127][b];
> }
> }
> printf("{");
> for(b=0;b<127;b++)
> printf("%2u,",state[(i-1)%127][b]);
> printf("}\n");
>
> return 0;
> }
| |
| macairt 2007-06-01, 6:55 pm |
| On May 27, 6:44 pm, "cr88192" <cr88...@NOSPAM.hotmail.com> wrote in
part:
> for a project of mine, I have needed a fairly special PRNG. the point is to
> have a sufficiently large amount of entropy that it generates
> hopefully-unique strings.
>
> additionally, it is important that the generated strings are different each
> time the app is run, and are different between different installations.
>
> in general, I am using a 4096-bit state.
> the state is preserved in files.
>
> the strings are 96-bit numbers encoded in base48 (seemed sane).
>
> there are a few restrictions on name construction:
> only letters, numbers, and '_' are allowed;
> it is not allowed to have 2 or more '_' chars in a row ("__");
> the sequence can't start with a number, and should not start with '_'.
Look at using an arithmetic feistel network to encode the underlying
permutation first. Do this and you can avoid using lookup tables
altogether.
| |
| cr88192 2007-06-03, 6:55 pm |
|
"macairt" <macairt@hotmail.com> wrote in message
news:1180719701.597786.240650@d30g2000prg.googlegroups.com...
> On May 27, 6:44 pm, "cr88192" <cr88...@NOSPAM.hotmail.com> wrote in
> part:
>
>
> Look at using an arithmetic feistel network to encode the underlying
> permutation first. Do this and you can avoid using lookup tables
> altogether.
>
well, a string containing only letters works well enough. I can easily do
base 48 this way, but don't have enough chars for for 64...
using purely letters avoids all the other hassles, since purely alphabetic
strings are always valid.
but, oh well, misc and idle...
|
|
|
|
|