Solved

one-way data "masking" MD5 sql

Posted on 2016-10-12
26
99 Views
Last Modified: 2016-11-08
Refer to attached sql script which my DBA colleague plan to use to
"mask" sensitive data in the DB (I presume it's Oracle).

Q1:
Is this a 1-way hashing or the 'masked' data can be 'unhashed' back?

Q2:
Is this MD5 the industry practice or there are more secure ways of 'masking' it

Q3:
How do people verify the 'hashed' data?   Do we do 'norows export' & randomly
check the exported tables or is there a way to export out table by table of the
hashed tables?

Q4:
What are the things to look out for in the logs to verify this 'masking' process
completed successfully or how can I amend this script further so that it will
log down its activities for verification?

Q5:
How does the script ensure that the keys that link the various tables are 'hashed'
together so that it does not break the entity relationship & the 'masked' database
can still be used by the vendor we're sending this database to?

Q6:
How do people normally send such a 'masked' Oracle DB out : by encrypted tapes
or USB HDD or slowly via sftp over Internet?
0
Comment
Question by:sunhux
  • 8
  • 7
  • 7
  • +2
26 Comments
 

Author Comment

by:sunhux
ID: 41840746
Somehow couldn't attach the file, so pasted below:


drop table ENTITY_REFERENCE;


create table ENTITY_REFERENCE
(entity_code varchar2(50) NOT NULL,
entity_code_md5_key varchar2(50) NOT NULL,
entity_desc varchar2(100) NULL,
entity_desc_md5_key varchar2(100) NULL
);

insert into ENTITY_REFERENCE
SELECT entity_code,rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(entity_code)))entity_code_md5_key,
entity_desc,rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(coalesce(entity_desc,'blank'))))entity_desc_md5_key
FROM ENTITY;




------------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------------

--select distinct table_name from SECURITY_REFERENCE

drop table SECURITY_REFERENCE;

create table SECURITY_REFERENCE 
(table_name varchar2(50) NOT NULL,
SECURITY_REFERENCE varchar2(101) NULL,
SECURITY_REFERENCE_md5_key varchar2(101) NULL
);

select table_name, count(1) from SECURITY_REFERENCE group by table_name
select * from SECURITY_REFERENCE 

insert into SECURITY_REFERENCE 
select distinct 'SECURITY_POSITIONS' table_name, SECURITY_REFERENCE, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(SECURITY_REFERENCE))) from SECURITY_POSITIONS_MASKTEST;
insert into SECURITY_REFERENCE 
select distinct'REPO' table_name, SECURITY_REFERENCE, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(SECURITY_REFERENCE))) from REPO_MASKTEST;
insert into SECURITY_REFERENCE 
select distinct'SECURITY_S' table_name, SECURITY_REFERENCE, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(SECURITY_REFERENCE))) from SECURITY_MASKTEST;
insert into SECURITY_REFERENCE 
select distinct'SECURITY_A' table_name, attribute_2, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(attribute_2))) from SECURITY_MASKTEST;
insert into SECURITY_REFERENCE 
select distinct'COLLATERAL' table_name, SECURITY_REFERENCE, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(SECURITY_REFERENCE))) from SECURITY_MASKTEST;


------------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------------

--drop table CONTRACT_REFRENCE
--drop table CONTRACT_REFERENCE

create table CONTRACT_REFERENCE
(table_name varchar2(50) NOT NULL,
CONTRACT_REFERENCE varchar2(101) NULL,
CONTRACT_REFERENCE_md5_key varchar2(101) NULL
);

insert into CONTRACT_REFERENCE 
select distinct 'LOANDEPO' table_name, CONTRACT_REFERENCE, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(CONTRACT_REFERENCE))) from LOANDEPO_MASKTEST;
insert into CONTRACT_REFERENCE 
select distinct 'ACCOUNT' table_name, ACCOUNT_REFERENCE, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(ACCOUNT_REFERENCE))) from ACCOUNT_MASKTEST;
insert into CONTRACT_REFERENCE 
select distinct 'SECURITY_POSITIONS' table_name, CONTRACT_REFERENCE, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(CONTRACT_REFERENCE))) from SECURITY_POSITIONS_MASKTEST;
insert into CONTRACT_REFERENCE 
select distinct 'SWAP' table_name, CONTRACT_REFERENCE, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(CONTRACT_REFERENCE))) from SWAP_MASKTEST; 
insert into CONTRACT_REFERENCE 
select distinct 'FOREX' table_name, CONTRACT_REFERENCE, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(CONTRACT_REFERENCE))) from FOREX_MASKTEST;
insert into CONTRACT_REFERENCE 
select distinct 'DEAL_IMPORT' table_name, CONTRACT_REFERENCE, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(CONTRACT_REFERENCE))) from DEAL_IMPORT_MASKTEST;
insert into CONTRACT_REFERENCE 
select distinct 'EXCHANGE_OPTION' table_name, CONTRACT_REFERENCE, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(CONTRACT_REFERENCE))) from EXCHANGE_OPTION_MASKTEST;
insert into CONTRACT_REFERENCE 
select distinct 'FACILITY' table_name, CONTRACT_REFERENCE, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(CONTRACT_REFERENCE))) from FACILITY_MASKTEST;
insert into CONTRACT_REFERENCE 
select distinct 'REPO' table_name, CONTRACT_REFERENCE, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(CONTRACT_REFERENCE))) from REPO_MASKTEST;
insert into CONTRACT_REFERENCE 
select distinct 'CREDIT_RATING' table_name, REFERENCE, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(REFERENCE))) from CREDIT_RATING_MASKTEST;
insert into CONTRACT_REFERENCE 
select distinct 'ENTITY_LINKS_M' table_name, mother_code, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(mother_code))) from entity_links_MASKTEST;
insert into CONTRACT_REFERENCE 
select distinct 'ENTITY_LINKS_C' table_name, child_code, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(child_code))) from entity_links_MASKTEST;
insert into CONTRACT_REFERENCE 
select distinct 'COLLATERAL' table_name, CONTRACT_REFERENCE, rawtohex(DBMS_OBFUSCATION_TOOLKIT.md5 (input => UTL_RAW.cast_to_raw(CONTRACT_REFERENCE))) from COLLATERAL_MASKTEST;

------------------------------------------------------------------------------------------------------------------------------------------------------------

--select table_name, count(1) from SECURITY_REFRENCE group by table_name order by 1
--select table_name, count(1) from CONTRACT_REFERENCE group by table_name order by 1
--select * from ENTITY_REFRENCE

------------------------------------------------------------------------------------------------------------
------------------------------------ENTITY_CODE JOIN--------------------------------------------------------

update ENTITY_MASKTEST E
set E.ENTITY_CODE = (select ER.ENTITY_CODE_MD5_KEY from ENTITY_REFERENCE ER where ER.ENTITY_CODE = E.ENTITY_CODE);
--select * from ENTITY_MASKTEST


update LOANDEPO_MASKTEST  LN
set LN.COUNTERPARTY_CODE = (select ER.ENTITY_CODE_MD5_KEY from ENTITY_REFERENCE ER where ER.ENTITY_CODE = LN.COUNTERPARTY_CODE);
--select COUNTERPARTY_CODE from LOANDEPO_MASKTEST  


update ACCOUNT_MASKTEST  AC
set AC.COUNTERPARTY_CODE = (select ER.ENTITY_CODE_MD5_KEY from ENTITY_REFERENCE ER where ER.ENTITY_CODE = AC.COUNTERPARTY_CODE);
--select COUNTERPARTY_CODE from ACCOUNT_MASKTEST  
select count(1) from ENTITY_REFERENCE


update SECURITY_MASKTEST  SC
set SC.ISSUER_CODE = (select ER.ENTITY_CODE_MD5_KEY from ENTITY_REFERENCE ER where ER.ENTITY_CODE = SC.ISSUER_CODE);
--select ISSUER_CODE from SECURITY_MASKTEST 


update SECURITY_POSITIONS_MASKTEST  SCP
set SCP.CPTY_CODE = (select ER.ENTITY_CODE_MD5_KEY from ENTITY_REFERENCE ER where ER.ENTITY_CODE = SCP.CPTY_CODE);
--select CPTY_CODE from SECURITY_POSITIONS_MASKTEST  


update SWAP_MASKTEST SW
set SW.COUNTERPARTY_CODE = (select ER.ENTITY_CODE_MD5_KEY from ENTITY_REFERENCE ER where ER.ENTITY_CODE = SW.COUNTERPARTY_CODE);
--select COUNTERPARTY_CODE from SWAP_MASKTEST  


update FOREX_MASKTEST FX
set FX.COUNTERPARTY_CODE = (select ER.ENTITY_CODE_MD5_KEY from ENTITY_REFERENCE ER where ER.ENTITY_CODE = FX.COUNTERPARTY_CODE);
--select COUNTERPARTY_CODE from FOREX_MASKTEST  


update DEAL_IMPORT_MASKTEST DI
set DI.COUNTERPARTY = (select ER.ENTITY_CODE_MD5_KEY from ENTITY_REFERENCE ER where ER.ENTITY_CODE = DI.COUNTERPARTY);
--select COUNTERPARTY from DEAL_IMPORT_MASKTEST  


update EXCHANGE_OPTION_MASKTEST EO
set EO.COUNTERPARTY_CODE = (select ER.ENTITY_CODE_MD5_KEY from ENTITY_REFERENCE ER where ER.ENTITY_CODE = EO.COUNTERPARTY_CODE);
--select COUNTERPARTY_CODE from EXCHANGE_OPTION_MASKTEST  


update FACILITY_MASKTEST FAC
set FAC.COUNTERPARTY_CODE = (select ER.ENTITY_CODE_MD5_KEY from ENTITY_REFERENCE ER where ER.ENTITY_CODE = FAC.COUNTERPARTY_CODE);
--select COUNTERPARTY_CODE from FACILITY_MASKTEST 


update REPO_MASKTEST R
set R.COUNTERPARTY_CODE = (select ER.ENTITY_CODE_MD5_KEY from ENTITY_REFERENCE ER where ER.ENTITY_CODE = R.COUNTERPARTY_CODE);
--select COUNTERPARTY_CODE from REPO_MASKTEST


update ISSUER_CREDIT_RATINGS_MASKTEST ICR 
set ICR.ENTITY_CODE = (select ER.ENTITY_CODE_MD5_KEY from ENTITY_REFERENCE ER where ER.ENTITY_CODE = ICR.ENTITY_CODE);
--select distinct ENTITY_CODE from ISSUER_CREDIT_RATINGS_MASKTEST;


update COMPANIES_MASKTEST COM 
set COM.ENTITY_CODE = (select ER.ENTITY_CODE_MD5_KEY from ENTITY_REFERENCE ER where ER.ENTITY_CODE = COM.ENTITY_CODE);
--select ENTITY_CODE from COMPANIES 


update COUNTRY_MASKTEST CTRY
set CTRY.ENTITY_CODE = (select ER.ENTITY_CODE_MD5_KEY from ENTITY_REFERENCE ER where ER.ENTITY_CODE = CTRY.ENTITY_CODE);
--select ENTITY_CODE from COUNTRY_MASKTEST


update COLLATERAL_MASKTEST CLT
set CLT.COUNTERPARTY = (select ER.ENTITY_CODE_MD5_KEY from ENTITY_REFERENCE ER where ER.ENTITY_CODE = CLT.COUNTERPARTY);
--select COUNTERPARTY from COLLATERAL_MASKTEST 
select * from COLLATERAL_MASKTEST


-----------------------------------------------------------------------------------------------------------------
------------------------------------ENTITY_CODE_DESC JOIN--------------------------------------------------------

Open in new window

0
 
LVL 76

Expert Comment

by:slightwv (䄆 Netminder)
ID: 41840760
I'm not going to want through all your code to see everything it is doing.

Hashing is meant to be one-way.  There is not superposed to be any way to turn it back into the passed value.

I use "meant" and "supposed" because some of the algorithms have been cracked.

It isn't masking.  Those are two different terms that mean two different things.

MD5 should be avoided at all costs:
https://en.wikipedia.org/wiki/Hash_function_security_summary

I think the minimum acceptable these days is SHA256.

What are you asking about "verifying" data?  You query it.  Does it look hashed?  If yes, then the hash worked...
0
 
LVL 73

Expert Comment

by:sdstuber
ID: 41840767
Q1:

You can't "unhash", but you can still "crack" it with sufficient computing power, time and techniques


 Q2:
MD5 is definitely NOT a secure hashing mechanism.  Modern systems (like Oracle itself) uses SHA2 hashing.

https://www.experts-exchange.com/articles/855/How-Oracle-Stores-Passwords.html
or for a more up to date version of this article
https://seanstuber.wordpress.com/2016/03/19/how-oracle-stores-passwords/

Also, the obfuscation toolkit is deprecated, you should use dbms_crypto.  If you're on 11g you might want to use a java stored procedure to do the higher bit sha2 methods.  In 12c dbms_crypto has them built in.

 Q3:
You verify a hash by taking a known value, hashing it and checking if it matches an already known hash.  If they don't, then either the hashing algorithm has changed or the original data is not the same.

 Q4:
I don't know what you're trying to find in your logs

 Q5:
You have to manage FKs yourself.  Which usually means either removing the FKs hashing the parent and child tables then reapplying the FKs
or if the FKs are deferred, then updating the parent and child and then commiting
or, if the FKs are not deferred, then deleting the children, updating the parent with the hash, then reinserting the children with the newly hashed keys

 Q6:
It would depend on the size, the relationship with the vendor and the nature of the data and rules around it.    If it's not that big of a deal if it might get lost then a usb in the mail might be sufficient.  If more security is needed then maybe some secure internet transfer.
If even more security then maybe a usb drive or tapes with secured couriers and signed/biometric authentication for transfer of control.
0
 
LVL 38

Accepted Solution

by:
Adam Brown earned 200 total points
ID: 41840961
MD5 hashing is not sufficient for most purposes at this point. Almost all hashes for values with fewer than 10 characters have been mapped in Rainbow tables, so MD5 hashing will not be a good solution for masking data, as it will only delay the recovery of any information in the DB. Even SHA1 is considered a "weak" hashing algorithm at this point. Further, a DB that has been Obfuscated with MD5 will be effectively unusable if all tables are hashed because the MD5 hashes can't be backward calculated. You would only use MD5 if you wanted to ship a copy of the DB to an entity with only certain values in certain tables are obfuscated to prevent unintentional release, but this, too, is effectively useless because MD5 is so thoroughly mapped.

You'll also want to make sure that the version of Oracle this DB runs in supports the DBMS_OBFUSCATION_TOOLKIT, since not all versions do. 10.2 does, and if that's your version, the only secure method you have available is 3DES encryption. And that's kind of Iffy, because 3DES is just the same as DES run three times. DES was broken completely in the mid 90s, so 3DES is considered less secure now.

Newer versions of Oracle use the DBMS_CRYPTO function instead and DBMS_OBFUSCATION_TOOLKIT has been deprecated, which would cause your query to fail on those versions. DBMS_CRYPTO in the latest version of Oracle supports up to SHA256 keys and AES, so it's a much more secure method of doing things.
0
 

Author Comment

by:sunhux
ID: 41841320
Thanks, so
   DBMS_OBFUSCATION_TOOLKIT.md5
should be
   DBMS_CRYPTO  ??
0
 

Author Comment

by:sunhux
ID: 41841715
Feedback from DBA:

The size of the encrypted key for DBMS_CRYPTO  SHA256 algorithm  is 64 character while for MD5 it is 32 character. The 64 char exceeds the size of fields ENTITY_CODE and CONTRACT_REFERENCE (only for ENTITY_LINKS table) :

so what do we do?  Will it break anything if we truncate the length of the key?


I have a clarification on the 64 & 32 chars that our DBA says:
from the link
  https://docs.oracle.com/cd/E18283_01/appdev.112/e16760/d_crypto.htm#BJFGFDFG
it says:

HASH_MD4
Produces a 128-bit hash, or message digest of the input message

HASH_MD5
Also produces a 128-bit hash, but is more complex than MD4

HASH_SH1
Secure Hash Algorithm (SHA). Produces a 160-bit hash.
 

So DBMS_CRYPTO  is using HASH_SH1 ?
So is 160bit = 64 char ?
0
 
LVL 76

Assisted Solution

by:slightwv (䄆 Netminder)
slightwv (䄆 Netminder) earned 140 total points
ID: 41841828
>>so what do we do?  Will it break anything if we truncate the length of the key?

You cannot truncate hashes. Doing so might result in a collision where two entries get the same value when they shouldn't.

>>The size of the encrypted key for DBMS_CRYPTO  SHA256 algorithm  is 64 character while for MD5 it is 32 character

Which is why one is safer to use than the other.  You can use MD5 but doing so can make the data viewable by those that receive it.  Might as well just provide the data unhahsed.

>>So is 160bit = 64 char ?

Should be simple math based on your character set.

I get 40 characters with:
select length(dbms_crypto.hash(utl_raw.cast_to_raw('Hello'),3)) from dual;
0
 

Author Comment

by:sunhux
ID: 41841901
So instead of truncating the hash, we should increase the length of the
hash field (I'm no DBA, so I don't know if I've used the right term here)?

>You can use MD5 but doing so can make the data viewable by those that receive it.
Don't understand the above;  does it mean the hashed data (or database) can
be read by the external vendor whom we are sending the hashed DB to?
0
 
LVL 76

Assisted Solution

by:slightwv (䄆 Netminder)
slightwv (䄆 Netminder) earned 140 total points
ID: 41841912
>>So instead of truncating the hash, we should increase the length of the  hash field

Yes.

>> does it mean the hashed data (or database) can  be read by the external vendor whom we are sending the hashed DB to?

Read the post above:
MD5 hashing is not sufficient for most purposes at this point. Almost all hashes for values with fewer than 10 characters have been mapped in Rainbow tables, so MD5 hashing will not be a good solution for masking data, as it will only delay the recovery of any information in the DB.

Don't take our word for it.  Go out and look around the web:  MD5 is no longer considered "safe".

>>can  be read by the external vendor whom we are sending the hashed DB to?

Can I ask why you are sending hashed data to someone that cannot see the actual data?

Seems like a lot of work for nothing.  Just don't send them any data in the first place or replace all the valid data with random data not hashed data.
0
 

Author Comment

by:sunhux
ID: 41841934
As this hashed DB is for vendor's development, I guess collisions wont hamper
development work: so long as truncating it won't increase the security risk or
will it?

http://crypto.stackexchange.com/questions/9435/is-truncating-a-sha512-hash-to-the-first-160-bits-as-secure-as-using-sha1
Referring to above, if SHA256 hash is truncated to MD5's length, what's the chance of
collision?   2 to the power ?
0
 
LVL 76

Assisted Solution

by:slightwv (䄆 Netminder)
slightwv (䄆 Netminder) earned 140 total points
ID: 41841964
>> I guess collisions wont hamper

It can but it depends.  If you have a foreign key relationship in the database a collision can either corrupt that relationship or improperly link rows that shouldn't be linked.

The thing with probability is that unless it is 100%, chance plays a part.  Even if the probability of not having a collision is 99%, are you willing to bet that 1% will not break the project?

It is all about risk and what level you are willing to accept.

I mean, you can use MD5 but you accept the risk that some of the data can be cracked if the developer wants to take the time.

>>so long as truncating it won't increase the security risk or  will it?

If you take a hashed value and truncate it, I'm guessing the data would be uncrackable but I cannot say with 100% certainty.

>>As this hashed DB is for vendor's development

Hashing seems like a LOT of unnecessary work.  Just use random generated data or manually seed the database with dummy data.

To simulate the hash using random data:
If a main table has the string 'Bob' as a value and that is a foreign key in other tables, generate a random 3 character string, dbms_random.string('a',3), then update all the 'Bob' values in all the tables with that.

Then you don't have to worry about the hashed values getting cracked.

Or just provide an empty database and manually insert known dummy data from a script you create.
0
 

Author Comment

by:sunhux
ID: 41841982
>Can I ask why you are sending hashed data to someone that cannot see the actual data?
This outsourced remote applications vendor is helping us to do work but we can't let
them see the actual sensitive data.  I presume the vendor could work with 'fictitious'
data but the keys linking the various relational tables must not break

>hashes for values with fewer than 10 characters have been mapped in Rainbow tables, so
>MD5 hashing will not be a good solution
But our DBA pointed out that MD5 is 32 character; so it's still mappable in Rainbow tables?
0
Do You Know the 4 Main Threat Actor Types?

Do you know the main threat actor types? Most attackers fall into one of four categories, each with their own favored tactics, techniques, and procedures.

 
LVL 76

Assisted Solution

by:slightwv (䄆 Netminder)
slightwv (䄆 Netminder) earned 140 total points
ID: 41842042
>>But our DBA pointed out that MD5 is 32 character; so it's still mappable in Rainbow tables?

Your DBA can debate it all he wants.

Google around:  MD5 is no longer considered reliable.  Period.  End of discussion.

The quote above "hashes for values with fewer than 10 characters have been mapped in Rainbow tables" is the raw data that has been hashed not the length of the hash value.

If you take the word 'Bob' (less than 10 characters) and hash it using MD5, it wouldn't take much work to run that hashed value against a Rainbow table to see the MD5 hashed value means 'Bob'.

>>but the keys linking the various relational tables must not break

That is exactly the point I mentioned above about the chances of your truncation resulting in a collision possible breaking keys.

>>I presume the vendor could work with 'fictitious'  data

It is up to your company to ensure the dummy data you provide is accurate.

>>we can't let  them see the actual sensitive data

Can you not replace it with random data like I suggested above?  I mean if you can hash it and replace the values with a hashed value, why can't you replace it with a randomly generated string?

You would just need to do a little extra work to make sure the random string you generated wasn't previously generated and already used.
0
 
LVL 73

Assisted Solution

by:sdstuber
sdstuber earned 110 total points
ID: 41842046
>>> I presume the vendor could work with 'fictitious' data

That's probably an easier way to go.
If you've got a good generator you don't even need to send data, just send the db structure as an empty db export along with the generator and they can create as much fake data as they want.

This isn't a new problem.  Many companies have some data that must remain private.
We have some that we can send to vendors "open"
We have some that we have to mask.  Sometimes it's just a matter of scrambling a few columns, sometimes we need to use a data generator so nothing is real.
Yes - a data generator can someties be a significant amount of work to build; but once you have it, it's a great tool to have and reusable.  You can reuse it inhouse for data scaling tests.  What happens when you grow by 10%, 20%, 100%, 1000%  ?


>>>But our DBA pointed out that MD5 is 32 character; so it's still mappable in Rainbow tables?

yes, md5 cracking with rainbow tables is well established.
also, md5 hash was never intended to be used for security.  So, it's relatively easy to get around because it was never supposed to be a barrier.
0
 
LVL 73

Assisted Solution

by:sdstuber
sdstuber earned 110 total points
ID: 41842068
Since your concern is making the original data unreadable.  I would go with the hash and truncate method.  To make it more secure,  cycle it a few times.  That is, hash the values, then hash the hash, then hash that hash, do that a bunch of times, add a randomized salt if you want to skew it even more.  Then truncate to the length you need.
You could automate the entire hash cycle and truncation with randomized salt generation on each step so not even you knows what went into the hashing.

If, by chance, you happen to get an FK violation or duplicate key violation,  my guess is they will be seldom and you can manually adjust those as needed.  Since your goal is to create fake data anyway, it doesn't matter at all what you use to resolve the constraint violations.
0
 
LVL 76

Expert Comment

by:slightwv (䄆 Netminder)
ID: 41842075
>>I would go with the hash and truncate method ... hash the values, then hash the hash, then hash that hash, do that a bunch of times

Over dbms_random.string?
0
 
LVL 73

Expert Comment

by:sdstuber
ID: 41842088
>>> Over dbms_random.string?

that will work fine if you order the updates/inserts correctly to maintain FKs.

I was suggesting hashing so that there would be a correlation, albeit irreversible, between original and hashed output.  That way when "slightwv" is hashed, it will always be the same value, and thus preserve FKs.
0
 
LVL 76

Expert Comment

by:slightwv (䄆 Netminder)
ID: 41842101
Unless I'm missing something, which is entirely possible, you would need to hash "slightwv", then manually update ALL the keys/columns manually with the hashed value.

Isn't that the same workload if I generate a random string to replace "slightwv" with?  Still the same number of updates.
0
 
LVL 73

Expert Comment

by:sdstuber
ID: 41842136
yes, but you have to manually maintain that.

that is, you generate a random string for slightwv then you have to go find all rows/columns that have that and update them to the same thing.

if you simply do   col=some_hash(col)  

and "some_hash" returns a consistent value then you don't need to figure anything out and find related values, they'll sort themselves out.


Now - it would be possible to write "some_hash" as

dbms_random.seed(my_parameter);
return dbms_random.string('x',20);

but that  essentially writing our own hash, using a non-secure randomizer.
so, definitely more crackable than SHA
0
 
LVL 38

Assisted Solution

by:Adam Brown
Adam Brown earned 200 total points
ID: 41842150
>hashes for values with fewer than 10 characters have been mapped in Rainbow tables, so
 >MD5 hashing will not be a good solution
 But our DBA pointed out that MD5 is 32 character; so it's still mappable in Rainbow tables?

What I mean is that every pre-hash string with less than 10 characters has had its hash mapped. This means if you hash the word "elephant" using MD5, someone can compare the hash to what exists in a rainbow table and determine the value using just the hash. All MD5 hashes are 32 characters long, but the data those hashes represent can be anything from 1 to an infinite number of characters long. If the data values that you are trying to hash are 32 characters long themselves, MD5 could work as a method for obscuring the data, but there are some inherent weaknesses in the MD5 algorithm that could allow someone to recover data with *some* effort.

Here's what you should consider:
1. You are trying to hide data from being easily determined, either to meet compliance or prevent disclosure to an unauthorized viewer. Hashing is *great* for this.
2. The hashing algorithm you want to use for this purpose depends on what kind of data you are hashing, and the limitations you have to work with.
3. MD5 *can* work for data obfuscation, but only if the data being obscured is longer than a certain number of characters. The more characters there are in the data you're trying to obscure, the less likely it will be that the data is listed in a rainbow table.
4. As I mentioned, if the data you are trying to obscure is 10 characters or less (alphanumeric), there is an extremely high likelihood that the data will be in a rainbow table somewhere.
5. MD5's major weaknesses are not as severe in data obfuscation as they are with password use, since the majority of password systems compare hashes rather than actual values. The password you set on an account is hashed. When you log in with that account, the value you enter is hashed, then that hash is compared with the stored hash of the password you set. If they match, you get access, if not, you don't. MD5 is weak primarily because it's keyspace has been "mapped" very thoroughly, meaning that almost every hash you can generate with MD5 has a known value tied to it. For passwords, this is very bad, because you don't have to actually know a password to log in, you have to know a value that can generate the same hash (This is called a collision).
6. For data obfuscation, collisions are not a major concern. If the hash for a piece of obfuscated data exists in a rainbow table somewhere, but the value tied to that hash in the table isn't the same as the data you have hashed, that is not a concern. They can't obtain the original data just from the hash.
7. In short, you need to consider the length of the pre-hashed values in your database. If they are things like street addresses, MD5 obfuscation is a more feasible choice. For things like a First or last name, Telephone numbers, or birthdates, MD5 obfuscation is essentially useless. The hashes you get may end up with just a collision, but there is too great a chance that the actual data will be listed in a rainbow table.

Does that all make more sense?

Now, the limitation you're working with, where the cells that each piece of data sits in has a 32 character limit to it, can cause some problems when dealing with hashing, but those problems are almost not worth considering for data obfuscation. Putting in the first 32 characters of a 64 character hash string and leaving out the last 32 characters is actually *better* for obfuscation because it hides the hashing algorithm as well, and the mathematical probability of getting the hashed data from half the hash is astronomical.

It is extremely important to note here, though, that when you're working with relational databases, if the table relationships rely on a specific piece of data, hashing that data will seriously screw with the relationship. It doesn't matter what algorithm you use or if the hashes are complete or not.
0
 
LVL 73

Assisted Solution

by:sdstuber
sdstuber earned 110 total points
ID: 41842211
Something like this is what I was envisioning for a hash looping function
This will create a random salt based on your input string and then hash your string over and over again with the salt and then hash the final result one more time before returning a hex string.


CREATE OR REPLACE FUNCTION hashcycle(p_str IN VARCHAR2)
    RETURN VARCHAR2
IS
    v_result   RAW(20);   -- if you change the hash algorithm, change this to be long enough to hold the intermediate results
    v_salt     RAW(10);
BEGIN
    -- If you're on 12c then swap hash_sh1 for hash_sh256, hash_sh384, or hash_sh512

    DBMS_RANDOM.seed(p_str);
    v_salt :=
        UTL_RAW.SUBSTR(
            sys.DBMS_CRYPTO.hash(
                UTL_RAW.cast_to_raw(DBMS_RANDOM.string('x', 20)),
                sys.DBMS_CRYPTO.hash_sh1
            ),
            1,
            10
        );

    v_result := sys.DBMS_CRYPTO.hash(UTL_RAW.cast_to_raw(p_str), sys.DBMS_CRYPTO.hash_sh1);

    FOR i IN 1 .. 100
    LOOP
        v_result :=
            sys.DBMS_CRYPTO.hash(UTL_RAW.CONCAT(v_result, v_salt), sys.DBMS_CRYPTO.hash_sh1);
    END LOOP;

    v_result := sys.DBMS_CRYPTO.hash(v_result, sys.DBMS_CRYPTO.hash_sh1);

    RETURN SUBSTR(RAWTOHEX(v_result), 1, 40);   -- change 40 to whatever truncation limit you want on the final result
END;

Open in new window



then you would simply disable the FKs and do

update some_table, set col1 = hashcycle(col1), col2 = hashcycle(col2), ...

then re-enable the FKs.

If any you get any UK or FK constraint violations then manually adjust those rows/columns until they are valid.  My expection would be this will be rare (probably zero.)
0
 

Author Comment

by:sunhux
ID: 41843650
Our DBA proposes to make those keys that are 'hashed' to have Unique Constraint
so if he detects as collision/duplicate hashed keys, then manually adjust which he
thinks will not be many : anyone has any view on our DBA's proposal?
0
 
LVL 76

Expert Comment

by:slightwv (䄆 Netminder)
ID: 41843656
How do you make it a unique constraint if it is the child table of a foreign key constraint in a one-to-many relationship?
0
 
LVL 73

Expert Comment

by:sdstuber
ID: 41843683
The proposal doesn't make any sense to me.

Your application should have constraints in place, hashing the data to mask it does not (or should not) in any way require new constraints to be added.

For example, lets say you have a table of customer names.

My name is Sean, you could hash that to A072A5313C9ACCF4C24B2DC8145937B4
Instead of hashing you could use a random data generator and change my name to Blippo123

In both cases the original name is unrecoverable.

Why would adding a unique constraint help?  What value could that possibly add?

Also, you could have multiple people named "Sean" in your customer base.
Adding a unique constraint would then cause your application to break because you couldn't have multiple people (either hashed or renamed) with the same name in your table, which is a completely artificial and likely erroneous rule.
0
 
LVL 26

Assisted Solution

by:skullnobrains
skullnobrains earned 50 total points
ID: 41849981
<might be offtopic>
i'm just answering to the part relative to md5
md5 is not unsafe. improper use of md5 is. same applies to SHA2.
many authentication mechanisms are still based on md5 and the existence of rainbow tables does not make a difference to them.
any security mechanism involving hashes that relies on the non-existence of rainbow tables is poorly designed whatever the key length.
SHA1 rainbow tables are almost ( maybe already ) complete and SHA2 comes soon.
<offtopic>

as far as constraint go, i'd suggest you add extra columns for the hashed data, then go through all the constraints on the original columns and apply them to the hashed ones, then possibly remove the original columns

note that i don't really see the point of this as a whole and i'd assume a simpler solution to exist. most likely a bunch of views would do the trick
0

Featured Post

Highfive Gives IT Their Time Back

Highfive is so simple that setting up every meeting room takes just minutes and every employee will be able to start or join a call from any room with ease. Never be called into a meeting just to get it started again. This is how video conferencing should work!

Join & Write a Comment

Article by: btan
Provide an easy one stop to quickly get the relevant information on common asked question on Ransomware in Expert Exchange.
This paper addresses the security of Sennheiser DECT Contact Center and Office (CC&O) headsets. It describes the DECT security chain comprised of “Pairing”, “Per Call Authentication” and “Encryption”, which are all part of the standard DECT protocol.
Video by: Steve
Using examples as well as descriptions, step through each of the common simple join types, explaining differences in syntax, differences in expected outputs and showing how the queries run along with the actual outputs based upon a simple set of dem…
Via a live example, show how to restore a database from backup after a simulated disk failure using RMAN.

758 members asked questions and received personalized solutions in the past 7 days.

Join the community of 500,000 technology professionals and ask your questions.

Join & Ask a Question

Need Help in Real-Time?

Connect with top rated Experts

22 Experts available now in Live!

Get 1:1 Help Now