From 13c170e7780a29eb15092e54599e05f1bf2c9cc9 Mon Sep 17 00:00:00 2001 From: Salar Rahmanian Date: Sun, 27 Apr 2025 13:42:13 -0700 Subject: [PATCH] New blog enhancements --- content/post/the-data-surrender-trap/index.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/content/post/the-data-surrender-trap/index.md b/content/post/the-data-surrender-trap/index.md index 418070c..8df51ae 100644 --- a/content/post/the-data-surrender-trap/index.md +++ b/content/post/the-data-surrender-trap/index.md @@ -10,7 +10,7 @@ categories = ["Data Engineering", "Artificial Intelligence", "Data Governance"] [extra] social_media_card = "TheDataSurrenderTrap.svg" toc = true -keywords = ["AI", "Databricks", "Data Governance", "Data Engineering", "Artificial Intelligence", "Data Surrender Trap", "Data Sharing", "Data Security", "Data Privacy", "Open Standards", "Zero-Copy Sharing", "Google Cloud", "AWS", "Microsoft Azure", "Snowflake", "Data Residency", "Data Compliance", "Data Strategy", "Iceberg", "Delta Lake", "Apache Ranger", "Open Policy Agent", "Unity Catalog", "Data Lineage", "Data Sharing Protocols", "MosaicML", "Model Serving", "Data Architecture", "DuckDB"] +keywords = ["AI", "Databricks", "Data Governance", "Data Engineering", "Artificial Intelligence", "Data Surrender Trap", "Data Sharing", "Data Security", "Data Privacy", "Open Standards", "Zero-Copy Sharing", "Google Cloud", "AWS", "Microsoft Azure", "Snowflake", "Data Residency", "Data Compliance", "Data Strategy", "Iceberg", "Delta Lake", "Apache Ranger", "Open Policy Agent", "Unity Catalog", "Data Lineage", "Data Sharing Protocols", "MosaicML", "Model Serving", "Data Architecture", "DuckDB", "Lakekeeper"] +++ ![The Data Surrender Trap](TheDataSurrenderTrap.svg) @@ -21,6 +21,7 @@ Handing raw customer data to a third party introduces three long-term headaches: 1. 
Governance and compliance risk – once data leaves your perimeter, you lose direct control over how long it’s stored, where it resides, and who can see it. A single mis-configuration or model-training clause could violate GDPR, HIPAA, or internal policy. 2. Technical debt – the day you need to swap providers, migrate regions, or delete a customer record, you discover tight coupling in schemas, pipelines, and security controls that were never designed for portability. +3. Synchronization overhead – having to synchronize data between multiple vendors and your own systems, which can lead to data inconsistencies and increased complexity. ## Best practices: bring the AI to the data, not the data to the AI @@ -45,7 +46,7 @@ Before we look at any vendor implementation, it helps to know the building-block | Layer | Open standard | Why it matters | | --- | --- | --- | | Table formats | Apache Iceberg, Delta Lake, Apache Hudi, Parquet | Column-oriented, ACID-capable tables that sit in ordinary cloud storage and are readable by engines like Spark, Trino, Flink, etc. Iceberg’s spec is fully open, so any vendor can implement it—preventing lock-in and enabling multi-cloud lakes. | -| Governance / access control | Apache Ranger, Open Policy Agent, Unity Catalog | Centralize table/row/column policies, data masking, and audit logs across dozens of engines and clouds—without embedding rules in every service. Ranger policies even support dynamic row-level filters. | +| Governance / access control | Apache Ranger, Open Policy Agent, Unity Catalog, Lakekeeper | Centralize table/row/column policies, data masking, and audit logs across dozens of engines and clouds—without embedding rules in every service. Ranger policies even support dynamic row-level filters. | | Data lineage | OpenLineage | A vendor-neutral API for emitting and collecting lineage events from Spark, Airflow, dbt, BigQuery, and more. Lets you trace every model back to the exact inputs that produced it. 
| | Zero-copy data sharing | Delta Sharing (REST), Iceberg REST Catalog, Arrow Flight SQL | Instead of emailing CSVs, expose live tables through open protocols. Recipients query directly—Spark, Pandas, BI tools—while you keep full revocation and audit control. Delta Sharing is the first open REST protocol for this purpose; Iceberg’s REST catalog spec and Arrow Flight do the same for metadata and high-speed transport. | @@ -109,7 +110,7 @@ Each step below tightens control, reduces copies, and shows how to give an exter | --- | --- | --- | | Inventory & classify | | You can’t apply least-privilege sharing if you don’t know what’s sensitive. | | Land everything in open, governed tables | | Open formats + immutable history make later audits and deletions possible. | -| Switch on a unified catalog | | One policy engine ≫ dozens of per-tool ACLs. | +| Switch on a unified catalog | | One policy engine ≫ dozens of per-tool ACLs. | | Harden the perimeter | | Keeps “shadow ETL” from copying data out the side door. | | Safely share with an external AI vendor (zero-copy) |
  1. Minimise first – aggregate, pseudonymise, or drop columns the vendor doesn’t need.
  2. Create a Share (Delta Sharing / Iceberg REST / Arrow Flight):  
    • Grant only the filtered table or view.
    • Attach row-level filters & column masks.
    • Issue a time-boxed bearer token (7-, 30-, or 90-day TTL) and pin it to the vendor’s IP range (see the Databricks documentation).
  3. Contract & controls – DPA, usage policy, no onward sharing.
  4. Monitor – streaming audit of every query; set alerts for unusually large scans.
  5. Revoke or rotate the token the moment the engagement ends (one CLI/API call).
| Zero-copy protocols let the vendor query live tables without replicating them. Instant revocation closes the door the second you’re done. | | Move internal ML pipelines onto the platform | | No more exporting giant CSVs to Jupyter on someone’s laptop. |