From 3cf0e816e68d72159b06f9a702fc9871327db366 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 12 Jun 2026 13:23:43 +0200 Subject: [PATCH 1/8] build: consolidate Rust crates into a Cargo workspace + extract native-common Move the standalone `native` crate into a root Cargo workspace and extract shared JNI plumbing (error->exception mapping, Tokio runtime singleton, StreamingReader) into a new `datafusion-jni-common` crate under `native-common/`. `native/src/errors.rs` moves to `native-common/src/errors.rs`; the nine native modules now import error/runtime helpers from `datafusion_jni_common`. Build glue follows: single root `Cargo.lock`, `.cargo/config.toml` redirects output to `rust-target/`, Makefile/CI/poms updated to build `--workspace` and target `-p datafusion-jni`. Core javadoc build commands updated to match. Pure refactor; no behavior change. First of a 6-PR stack splitting the Spark DataSource V2 connector work. Co-Authored-By: Claude Opus 4.8 (1M context) --- .cargo/config.toml | 21 ++ .github/workflows/build.yml | 4 +- .github/workflows/lint.yml | 8 +- .gitignore | 1 + native/Cargo.lock => Cargo.lock | 244 +++++++++--------- Cargo.toml | 48 ++++ Makefile | 10 +- core/pom.xml | 4 +- .../org/apache/datafusion/SessionContext.java | 11 +- .../SessionContextRuntimeStatsTest.java | 2 +- .../SessionContextSubstraitTest.java | 2 +- docs/source/contributor-guide/development.md | 21 +- .../updating-datafusion-version.md | 10 +- native-common/Cargo.toml | 35 +++ {native => native-common}/src/errors.rs | 7 +- native-common/src/lib.rs | 104 ++++++++ native/Cargo.toml | 43 ++- native/src/arrow.rs | 2 +- native/src/avro.rs | 2 +- native/src/cache_manager.rs | 2 +- native/src/csv.rs | 2 +- native/src/json.rs | 2 +- native/src/lib.rs | 78 +----- native/src/object_store.rs | 2 +- native/src/proto.rs | 2 +- native/src/runtime_metrics.rs | 6 +- native/src/schema.rs | 2 +- pom.xml | 17 +- 28 files changed, 444 insertions(+), 248 deletions(-) create mode 100644 .cargo/config.toml rename native/Cargo.lock => Cargo.lock (95%) create mode 100644 Cargo.toml create mode 100644 native-common/Cargo.toml rename {native => native-common}/src/errors.rs (97%) create mode 100644 native-common/src/lib.rs diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 0000000..d7e0ee2 --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Keep Cargo's workspace output out of `target/` so `mvn clean` (which deletes +# the root `target/`) does not nuke the Rust build cache. +[build] +target-dir = "rust-target" diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c5db936..da8e65a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -83,8 +83,8 @@ jobs: path: | ~/.cargo/registry ~/.cargo/git - native/target - key: ${{ runner.os }}-cargo-${{ hashFiles('native/Cargo.lock') }} + rust-target + key: ${{ runner.os }}-cargo-${{ hashFiles('Cargo.lock') }} restore-keys: ${{ runner.os }}-cargo- - name: Build native and run tests diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 4cf628f..952bf34 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -54,7 +54,7 @@ jobs: run: ./mvnw -q spotless:check - name: Check Rust formatting - run: cd native && cargo fmt --all -- --check + run: cargo fmt --all -- --check clippy: name: Clippy @@ -81,9 +81,9 @@ jobs: path: | ~/.cargo/registry ~/.cargo/git - native/target - key: ${{ runner.os }}-clippy-${{ hashFiles('native/Cargo.lock') }} + rust-target + key: ${{ runner.os }}-clippy-${{ hashFiles('Cargo.lock') }} restore-keys: ${{ runner.os }}-clippy- - name: Run clippy - run: cd native && cargo clippy --all-targets -- -D warnings + run: cargo clippy --workspace --all-targets -- -D warnings diff --git a/.gitignore b/.gitignore index 719a2a4..25c9216 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ target/ +rust-target/ *.class .idea/ .vscode/ diff --git a/native/Cargo.lock b/Cargo.lock similarity index 95% rename from native/Cargo.lock rename to Cargo.lock index 96d2f9d..dbbfcde 100644 --- a/native/Cargo.lock +++ b/Cargo.lock @@ -98,9 +98,9 @@ dependencies = [ [[package]] name = "ar_archive_writer" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" +checksum = "4087686b4b0a3427190bae57a1d9a478dbb2d40c5dc1bd6e2b6d797913bdd348" dependencies = [ "object", ] @@ -119,9 +119,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "607e64bb911ee4f90483e044fe78f175989148c2892e659a2cd25429e782ec54" +checksum = "378530e55cd479eda3c14eb345310799717e6f76d0c332041e8487022166b471" dependencies = [ "arrow-arith", "arrow-array", @@ -140,9 +140,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e754319ed8a85d817fe7adf183227e0b5308b82790a737b426c1124626b48118" +checksum = "a0ab212d2c1886e802f51c5212d78ebbcbb0bec980fff9dadc1eb8d45cd0b738" dependencies = [ "arrow-array", "arrow-buffer", @@ -154,9 +154,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841321891f247aa86c6112c80d83d89cb36e0addd020fa2425085b8eb6c3f579" +checksum = "cfd33d3e92f207444098c75b42de99d329562be0cf686b307b097cc52b4e999e" dependencies = [ "ahash", "arrow-buffer", @@ -173,9 +173,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f955dfb73fae000425f49c8226d2044dab60fb7ad4af1e24f961756354d996c9" +checksum = "0c6cd424c2693bcdbc150d843dc9d4d137dd2de4782ce6df491ad11a3a0416c0" dependencies = [ "bytes", "half", @@ -185,9 +185,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca5e686972523798f76bef355145bc1ae25a84c731e650268d31ab763c701663" +checksum = "4c5aefb56a2c02e9e2b30746241058b85f8983f0fcff2ba0c6d09006e1cded7f" dependencies = [ "arrow-array", "arrow-buffer", @@ -207,9 +207,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86c276756867fc8186ec380c72c290e6e3b23a1d4fb05df6b1d62d2e62666d48" +checksum = "e94e8cf7e517657a52b91ea1263acf38c4ca62a84655d72458a3359b12ab97de" dependencies = [ "arrow-array", "arrow-cast", @@ -222,9 +222,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db3b5846209775b6dc8056d77ff9a032b27043383dd5488abd0b663e265b9373" +checksum = "3c88210023a2bfee1896af366309a3028fc3bcbd6515fa29a7990ee1baa08ee0" dependencies = [ "arrow-buffer", "arrow-schema", @@ -235,9 +235,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd8907ddd8f9fbabf91ec2c85c1d81fe2874e336d2443eb36373595e28b98dd5" +checksum = "238438f0834483703d88896db6fe5a7138b2230debc31b34c0336c2996e3c64f" dependencies = [ "arrow-array", "arrow-buffer", @@ -251,9 +251,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4518c59acc501f10d7dcae397fe12b8db3d81bc7de94456f8a58f9165d6f502" +checksum = "205ca2119e6d679d5c133c6f30e68f027738d95ed948cf77677ea69c7800036b" dependencies = [ "arrow-array", "arrow-buffer", @@ -276,9 +276,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efa70d9d6b1356f1fb9f1f651b84a725b7e0abb93f188cf7d31f14abfa2f2e6f" +checksum = "1bffd8fd2579286a5d63bac898159873e5094a79009940bcb42bbfce4f19f1d0" dependencies = [ "arrow-array", "arrow-buffer", @@ -289,9 +289,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "faec88a945338192beffbbd4be0def70135422930caa244ac3cec0cd213b26b4" +checksum = "bab5994731204603c73ba69267616c50f80780774c6bb0476f1f830625115e0c" dependencies = [ "arrow-array", "arrow-buffer", @@ -302,9 +302,9 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18aa020f6bc8e5201dcd2d4b7f98c68f8a410ef37128263243e6ff2a47a67d4f" +checksum = "f633dbfdf39c039ada1bf9e34c694816eb71fbb7dc78f613993b7245e078a1ed" dependencies = [ "bitflags", "serde_core", @@ -313,9 +313,9 @@ dependencies = [ [[package]] name = "arrow-select" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a657ab5132e9c8ca3b24eb15a823d0ced38017fe3930ff50167466b02e2d592c" +checksum = "8cd065c54172ac787cf3f2f8d4107e0d3fdc26edba76fdf4f4cc170258942222" dependencies = [ "ahash", "arrow-array", @@ -327,9 +327,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6de2efbbd1a9f9780ceb8d1ff5d20421b35863b361e3386b4f571f1fc69fcb8" +checksum = "29dd7cda3ab9692f43a2e4acc444d760cc17b12bb6d8232ddf64e9bab7c06b42" dependencies = [ "arrow-array", "arrow-buffer", @@ -393,9 +393,9 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] name = "autocfg" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" [[package]] name = "base64" @@ -419,9 +419,9 @@ dependencies = [ [[package]] name = "bitflags" -version = "2.11.1" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" [[package]] name = "blake2" @@ -457,9 +457,9 @@ dependencies = [ [[package]] name = "bon" -version = "3.9.1" +version = "3.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f47dbe92550676ee653353c310dfb9cf6ba17ee70396e1f7cf0a2020ad49b2fe" +checksum = "b2f04f6fef12d70d42a77b1433c9e0f065238479a6cefc4f5bab105e9873a3c3" dependencies = [ "bon-macros", "rustversion", @@ -467,9 +467,9 @@ dependencies = [ [[package]] name = "bon-macros" -version = "3.9.1" +version = "3.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "519bd3116aeeb42d5372c29d982d16d0170d3d4a5ed85fc7dd91642ffff3c67c" +checksum = "7d0bd4c2f75335ad98052a37efb54f428b492f64340257143b3429c8a508fa7b" dependencies = [ "darling", "ident_case", @@ -482,9 +482,9 @@ dependencies = [ [[package]] name = "brotli" -version = "8.0.2" +version = "8.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" +checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -493,9 +493,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "5.0.0" +version = "5.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" +checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -503,9 +503,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.20.2" +version = "3.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" +checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" [[package]] name = "byteorder" @@ -530,9 +530,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.62" +version = "1.2.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1dce859f0832a7d088c4f1119888ab94ef4b5d6795d1ce05afb7fe159d79f98" +checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f" dependencies = [ "find-msvc-tools", "jobserver", @@ -571,9 +571,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.44" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +checksum = "1aa79e62e7697b8e29b513a68abacf485adcd1fe8284a4316c5ae868e6633327" dependencies = [ "iana-time-zone", "num-traits", @@ -789,9 +789,9 @@ dependencies = [ [[package]] name = "dashmap" -version = "6.1.0" +version = "6.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +checksum = "e6361d5c062261c78a176addb82d4c821ae42bed6089de0e12603cd25de2059c" dependencies = [ "cfg-if", "crossbeam-utils", @@ -1306,6 +1306,7 @@ dependencies = [ "arrow", "async-trait", "datafusion", + "datafusion-jni-common", "datafusion-proto", "datafusion-spark", "datafusion-substrait", @@ -1320,6 +1321,16 @@ dependencies = [ "url", ] +[[package]] +name = "datafusion-jni-common" +version = "0.1.0" +dependencies = [ + "datafusion", + "futures", + "jni", + "tokio", +] + [[package]] name = "datafusion-macros" version = "53.1.0" @@ -1607,9 +1618,9 @@ dependencies = [ [[package]] name = "displaydoc" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" dependencies = [ "proc-macro2", "quote", @@ -1624,9 +1635,9 @@ checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" [[package]] name = "either" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" [[package]] name = "equivalent" @@ -1932,9 +1943,9 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "http" -version = "1.4.0" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425" dependencies = [ "bytes", "itoa", @@ -1977,9 +1988,9 @@ checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" [[package]] name = "hyper" -version = "1.9.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca" +checksum = "55281c53a1894c864990125767da440a4e630446785086f52523b20033b74498" dependencies = [ "atomic-waker", "bytes", @@ -2269,13 +2280,12 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.98" +version = "0.3.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67df7112613f8bfd9150013a0314e196f4800d3201ae742489d999db2f979f08" +checksum = "f2025f20d7a4fa7785846e7b63d10a76d3f1cee98ee5cb79ea59703f95e42162" dependencies = [ "cfg-if", "futures-util", - "once_cell", "wasm-bindgen", ] @@ -2344,9 +2354,9 @@ dependencies = [ [[package]] name = "libbz2-rs-sys" -version = "0.2.3" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3a6a8c165077efc8f3a971534c50ea6a1a18b329ef4a66e897a7e3a1494565f" +checksum = "34b357333733e8260735ba5894eb928c02ecc69c78715f01a8019e7fa7f2db4c" [[package]] name = "libc" @@ -2403,9 +2413,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.29" +version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" [[package]] name = "lru-slab" @@ -2434,9 +2444,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.8.0" +version = "2.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" [[package]] name = "miniz_oxide" @@ -2450,9 +2460,9 @@ dependencies = [ [[package]] name = "mio" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" +checksum = "02bd0af71c67b473010cbbc60715ee815645a4dc942899111f494b4b737d6fda" dependencies = [ "libc", "wasi", @@ -2598,9 +2608,9 @@ dependencies = [ [[package]] name = "parquet" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d7efd3052f7d6ef601085559a246bc991e9a8cc77e02753737df6322ce35f1" +checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" dependencies = [ "ahash", "arrow-array", @@ -2762,9 +2772,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +checksum = "528ac67416ff8646872a3c02cad9cc4ee5dc9f9540c9b10771855c95cb2e5ae1" dependencies = [ "bytes", "prost-derive", @@ -2772,9 +2782,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" +checksum = "03da047801ff44bb6a4d407d4860c05fd70bb81714e6b2f3812603d5b145b042" dependencies = [ "heck", "itertools", @@ -2791,9 +2801,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +checksum = "b570b25f7617e43d59005d0990ccb79e950a423952cea19671b7a876da390adf" dependencies = [ "anyhow", "itertools", @@ -2804,9 +2814,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" +checksum = "f94967dc7688f3054c7fac87473ffae4cc4c3904800e2d9f5b857246d8963b0a" dependencies = [ "prost", ] @@ -3063,9 +3073,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.12.3" +version = "1.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" dependencies = [ "aho-corasick", "memchr", @@ -3092,9 +3102,9 @@ checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" [[package]] name = "regex-syntax" -version = "0.8.10" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" [[package]] name = "regress" @@ -3206,9 +3216,9 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" +checksum = "dab5152771c58876a2146916e53e35057e1a4dfa2b9df0f0305b07f611fdea4d" dependencies = [ "openssl-probe", "rustls-pki-types", @@ -3389,9 +3399,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.149" +version = "1.0.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" dependencies = [ "itoa", "memchr", @@ -3461,9 +3471,9 @@ dependencies = [ [[package]] name = "shlex" -version = "1.3.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba" [[package]] name = "simd-adler32" @@ -3503,9 +3513,9 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "socket2" -version = "0.6.3" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" +checksum = "52d1cfed4120b4d927bf7c0f86d2087a4a7d6027c906d9f9d525a80573b9be51" dependencies = [ "libc", "windows-sys 0.61.2", @@ -3900,9 +3910,9 @@ checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" [[package]] name = "typenum" -version = "1.20.0" +version = "1.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" +checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" [[package]] name = "typify" @@ -3959,9 +3969,9 @@ checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-segmentation" -version = "1.13.2" +version = "1.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" +checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8" [[package]] name = "unicode-width" @@ -4007,9 +4017,9 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.23.1" +version = "1.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" +checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" dependencies = [ "getrandom 0.4.2", "js-sys", @@ -4068,9 +4078,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.121" +version = "0.2.123" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49ace1d07c165b0864824eee619580c4689389afa9dc9ed3a4c75040d82e6790" +checksum = "a254a4b10c19a76f09a27640e7ffbf9bc30bf67e16a3bf28aaefa4920fe81563" dependencies = [ "cfg-if", "once_cell", @@ -4081,9 +4091,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.71" +version = "0.4.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96492d0d3ffba25305a7dc88720d250b1401d7edca02cc3bcd50633b424673b8" +checksum = "54568702fabf5d4849ce2b90fadfa64168a097eaf4b351ce9df8b687a0086aaf" dependencies = [ "js-sys", "wasm-bindgen", @@ -4091,9 +4101,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.121" +version = "0.2.123" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e68e6f4afd367a562002c05637acb8578ff2dea1943df76afb9e83d177c8578" +checksum = "24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -4101,9 +4111,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.121" +version = "0.2.123" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d95a9ec35c64b2a7cb35d3fead40c4238d0940c86d107136999567a4703259f2" +checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b" dependencies = [ "bumpalo", "proc-macro2", @@ -4114,9 +4124,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.121" +version = "0.2.123" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4e0100b01e9f0d03189a92b96772a1fb998639d981193d7dbab487302513441" +checksum = "7acbf7616c27b194bbb550bf77ed0c2c3e5b7fd1260a93082b95fb7f47959b92" dependencies = [ "unicode-ident", ] @@ -4170,9 +4180,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.98" +version = "0.3.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b572dff8bcf38bad0fa19729c89bb5748b2b9b1d8be70cf90df697e3a8f32aa" +checksum = "6e0871acf327f283dc6da28a1696cdc64fb355ba9f935d052021fa77f35cce69" dependencies = [ "js-sys", "wasm-bindgen", @@ -4580,9 +4590,9 @@ checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" [[package]] name = "yoke" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" +checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5" dependencies = [ "stable_deref_trait", "yoke-derive", @@ -4603,18 +4613,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.48" +version = "0.8.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.48" +version = "0.8.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" dependencies = [ "proc-macro2", "quote", @@ -4623,9 +4633,9 @@ dependencies = [ [[package]] name = "zerofrom" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df" +checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272" dependencies = [ "zerofrom-derive", ] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..d7f98f7 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[workspace] +resolver = "2" +members = [ + "native", + "native-common", +] + +# Every dependency used by any workspace member is declared here so version +# bumps live in one place and the resolver picks a single version of each +# crate across the workspace. Members reference these via `{ workspace = true }` +# and add per-crate flags (optional, features, default-features) at the use +# site. +[workspace.dependencies] +arrow = { version = "58", features = ["ffi"] } +async-trait = "0.1" +datafusion = { version = "53.1.0" } +datafusion-proto = "53.1.0" +datafusion-spark = "53.1.0" +datafusion-substrait = "53.1.0" +futures = "0.3" +jni = "0.21" +# Pinned to the major DataFusion 53.1 pulls in transitively (0.13.x) so we +# share the same `dyn ObjectStore` vtable and don't double-link. +object_store = { version = "0.13", default-features = false } +prost = "0.14" +prost-build = "0.14" +protoc-bin-vendored = "3" +tokio = { version = "1", features = ["rt-multi-thread"] } +# Optional, cfg-gated. See `native/Cargo.toml` for the build-flag dance. +tokio-metrics = "0.5" +url = "2" diff --git a/Makefile b/Makefile index 6d9b0ae..d6bcf2c 100644 --- a/Makefile +++ b/Makefile @@ -20,14 +20,14 @@ all: native jvm native: - cd native && cargo build + cargo build --workspace -# Build the native crate with the `runtime-metrics` Cargo feature enabled. +# Build the JNI crate with the `runtime-metrics` Cargo feature enabled. # Requires `--cfg tokio_unstable` because tokio-metrics gates its API there. # Default `make native` does not pull this in; callers who need # SessionContext.runtimeStats() pick this target explicitly. native-runtime-metrics: - cd native && RUSTFLAGS="--cfg tokio_unstable" cargo build --features runtime-metrics + RUSTFLAGS="--cfg tokio_unstable" cargo build -p datafusion-jni --features runtime-metrics jvm: ./mvnw package -DskipTests @@ -39,10 +39,10 @@ test: native # `:check` form inline in .github/workflows/lint.yml. format: ./mvnw -q spotless:apply - cd native && cargo fmt --all + cargo fmt --all clean: - cd native && cargo clean + cargo clean ./mvnw clean tpch-data: diff --git a/core/pom.xml b/core/pom.xml index 5ddf107..1e25736 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -102,8 +102,8 @@ under the License. - + value="${maven.multiModuleProjectDirectory}/rust-target/${datafusion.native.profile}/${datafusion.lib.filename}"/> + diff --git a/core/src/main/java/org/apache/datafusion/SessionContext.java b/core/src/main/java/org/apache/datafusion/SessionContext.java index ec0bd85..b68cda5 100644 --- a/core/src/main/java/org/apache/datafusion/SessionContext.java +++ b/core/src/main/java/org/apache/datafusion/SessionContext.java @@ -113,10 +113,11 @@ public DataFrame fromProto(byte[] planBytes) { * other Substrait-emitting tool — and hand them to DataFusion without round-tripping through SQL. * *

Substrait support is gated behind the {@code substrait} Cargo feature on the native crate - * and is off by default. Rebuild the native crate with {@code cargo build - * --features substrait} (or {@code cargo build --features substrait,protoc} for hermetic builds - * that vendor {@code protoc} via {@code cmake}) to enable it. If invoked against a native binary - * built without the feature, this method throws {@link RuntimeException} pointing at the flag. + * and is off by default. Rebuild the native crate with {@code cargo build -p + * datafusion-jni --features substrait} (or {@code ... --features substrait,protoc} for hermetic + * builds that vendor {@code protoc} via {@code cmake}) to enable it. If invoked against a native + * binary built without the feature, this method throws {@link RuntimeException} pointing at the + * flag. * * @throws IllegalArgumentException if {@code planBytes} is {@code null}. * @throws IllegalStateException if this context is closed. @@ -183,7 +184,7 @@ public MemoryUsage memoryUsage() { * Rebuild with: * *

{@code
-   * RUSTFLAGS="--cfg tokio_unstable" cargo build --features runtime-metrics
+   * RUSTFLAGS="--cfg tokio_unstable" cargo build -p datafusion-jni --features runtime-metrics
    * }
* *

If invoked against a native binary built without the feature, this method throws {@link diff --git a/core/src/test/java/org/apache/datafusion/SessionContextRuntimeStatsTest.java b/core/src/test/java/org/apache/datafusion/SessionContextRuntimeStatsTest.java index 120d179..d567275 100644 --- a/core/src/test/java/org/apache/datafusion/SessionContextRuntimeStatsTest.java +++ b/core/src/test/java/org/apache/datafusion/SessionContextRuntimeStatsTest.java @@ -37,7 +37,7 @@ * #checkFeatureEnabled}. Run * *

{@code
- * (cd native && RUSTFLAGS="--cfg tokio_unstable" cargo build --features runtime-metrics)
+ * RUSTFLAGS="--cfg tokio_unstable" cargo build -p datafusion-jni --features runtime-metrics
  * }
* * before {@code ./mvnw test} to exercise this class. diff --git a/core/src/test/java/org/apache/datafusion/SessionContextSubstraitTest.java b/core/src/test/java/org/apache/datafusion/SessionContextSubstraitTest.java index 34db3b5..a2cfb0a 100644 --- a/core/src/test/java/org/apache/datafusion/SessionContextSubstraitTest.java +++ b/core/src/test/java/org/apache/datafusion/SessionContextSubstraitTest.java @@ -50,7 +50,7 @@ * *

The {@code substrait} Cargo feature is off by default in {@code native/Cargo.toml}; if the * native crate was built without it, every test here is skipped (see {@link #checkFeatureEnabled}). - * Run {@code (cd native && cargo build --features substrait)} before {@code ./mvnw test} to + * Run {@code cargo build -p datafusion-jni --features substrait} before {@code ./mvnw test} to * exercise this class. */ class SessionContextSubstraitTest { diff --git a/docs/source/contributor-guide/development.md b/docs/source/contributor-guide/development.md index 984d77c..fdb00f4 100644 --- a/docs/source/contributor-guide/development.md +++ b/docs/source/contributor-guide/development.md @@ -42,7 +42,7 @@ This builds the native Rust crate and runs the JUnit tests. The steps can be run individually: ```sh -cd native && cargo build +cargo build --workspace ./mvnw test ``` @@ -74,14 +74,25 @@ disk space. The repository is a multi-module Maven build: -- `pom.xml` — parent POM declaring the `core` and `examples` modules and - shared plugin/dependency versions. +- `Cargo.toml` — Rust workspace root declaring the three crate members + (`native`, `native-common`, `examples/native`, `spark/bridge`) and `[workspace.dependencies]` + that pin shared versions in one place. Cargo writes artifacts to + `rust-target/` (overridden in `.cargo/config.toml`) so `mvn clean` at the + repo root does not nuke the Rust build cache. +- `pom.xml` — parent POM declaring the `core`, `spark`, and `examples` + modules and shared plugin/dependency versions. - `core/` — `datafusion-java` library module (Java sources, tests, and generated protobuf classes). +- `spark/` — `datafusion-java-spark` Spark DataSource V2 connector + (Scala + Java, pure JVM) and its `spark/bridge/` Rust SDK crate + (`datafusion-spark-bridge`: widening, scan machinery, `export_bridge!`). - `examples/` — `datafusion-java-examples` module containing runnable examples that depend on the library; built alongside the library so they - cannot fall out of sync with the API. -- `native/` — Rust crate (JNI + Arrow C Data Interface). + cannot fall out of sync with the API. Includes `examples/native/`, a + small `export_bridge!` cdylib used by the Spark connector demo + (`ExampleBridgeProviderFactory` + the pyspark script under + `examples/python/`). +- `native/` — `datafusion-jni` Rust crate (JNI + Arrow C Data Interface). - `proto/` — Protobuf definitions shared between Java and Rust. - `Makefile` — top-level build orchestration (`make test`, `make format`, `make tpch-data`). diff --git a/docs/source/contributor-guide/updating-datafusion-version.md b/docs/source/contributor-guide/updating-datafusion-version.md index 56d50dc..ef6cd10 100644 --- a/docs/source/contributor-guide/updating-datafusion-version.md +++ b/docs/source/contributor-guide/updating-datafusion-version.md @@ -21,7 +21,9 @@ under the License. Three things must move together when bumping DataFusion: -1. `native/Cargo.toml` — the `datafusion` crate dependency. +1. `Cargo.toml` (workspace root) — the `datafusion`, `datafusion-ffi`, + `datafusion-proto`, and `datafusion-substrait` entries in + `[workspace.dependencies]`. Members inherit from there. 2. `pom.xml` — the `` Maven property. **Must equal the Cargo version**; a mismatch means JVM-built protobuf plans won't deserialize on the native side. @@ -32,9 +34,9 @@ Three things must move together when bumping DataFusion: ## Recipe ```sh -# 1. Bump the Cargo dep -$EDITOR native/Cargo.toml # set datafusion = "" -(cd native && cargo update -p datafusion) +# 1. Bump the workspace dep +$EDITOR Cargo.toml # set datafusion = "" in [workspace.dependencies] +cargo update -p datafusion # 2. Bump the Maven property to match $EDITOR pom.xml # set diff --git a/native-common/Cargo.toml b/native-common/Cargo.toml new file mode 100644 index 0000000..0a797b4 --- /dev/null +++ b/native-common/Cargo.toml @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "datafusion-jni-common" +version = "0.1.0" +edition = "2021" +publish = false + +[features] +# `datafusion-jni` builds DataFusion with `avro`, which adds the +# `DataFusionError::AvroError` variant our classifier maps to IoException. +# Feature-forwarded so consumers that don't read Avro (the Spark helper) +# don't pull the apache-avro stack into their cdylib. +avro = ["datafusion/avro"] + +[dependencies] +datafusion = { workspace = true } +futures = { workspace = true } +jni = { workspace = true } +tokio = { workspace = true } diff --git a/native/src/errors.rs b/native-common/src/errors.rs similarity index 97% rename from native/src/errors.rs rename to native-common/src/errors.rs index d926544..caa2540 100644 --- a/native/src/errors.rs +++ b/native-common/src/errors.rs @@ -96,8 +96,11 @@ fn classify(err: &DataFusionError) -> &'static str { } DataFusionError::IoError(_) | DataFusionError::ObjectStore(_) - | DataFusionError::ParquetError(_) - | DataFusionError::AvroError(_) => "org/apache/datafusion/IoException", + | DataFusionError::ParquetError(_) => "org/apache/datafusion/IoException", + // The AvroError variant only exists when DataFusion is built with its + // `avro` feature, forwarded by this crate's own `avro` feature. + #[cfg(feature = "avro")] + DataFusionError::AvroError(_) => "org/apache/datafusion/IoException", // ArrowError is a 21-variant grab bag -- only some of those variants // are actually IO-shaped. DivideByZero / ArithmeticOverflow / Compute // / Cast / InvalidArgument / Memory etc. are execution-time failures diff --git a/native-common/src/lib.rs b/native-common/src/lib.rs new file mode 100644 index 0000000..f143d43 --- /dev/null +++ b/native-common/src/lib.rs @@ -0,0 +1,104 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! JNI plumbing shared by this workspace's native crates (`datafusion-jni` +//! and `datafusion-spark-bridge`, and through the latter every bridge +//! cdylib): the error-to-Java-exception mapping, the per-cdylib Tokio +//! runtime singleton, and the async-stream-to-`FFI_ArrowArrayStream` +//! bridge. +//! +//! Each cdylib statically links its own copy of this rlib, so [`runtime`] is +//! a per-cdylib singleton -- exactly the behaviour each crate had when this +//! code lived inline. Nothing here is exported with `#[no_mangle]`, so +//! linking this crate into several cdylibs loaded in one JVM cannot collide. + +pub mod errors; + +use std::panic::{catch_unwind, AssertUnwindSafe}; +use std::sync::OnceLock; + +use datafusion::arrow::array::RecordBatch; +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::arrow::error::ArrowError; +use datafusion::arrow::record_batch::RecordBatchReader; +use datafusion::execution::SendableRecordBatchStream; +use futures::StreamExt; +use tokio::runtime::{Handle, Runtime}; + +static RT: OnceLock = OnceLock::new(); + +/// The cdylib-wide Tokio runtime. +pub fn runtime() -> &'static Runtime { + runtime_with_init(|_| {}) +} + +/// Same singleton as [`runtime`], with a hook that runs exactly once, when +/// the runtime is created. `datafusion-jni` uses it to install its +/// runtime-metrics accumulator so the sampling baseline coincides with +/// runtime start; every later call (either entry point) returns the existing +/// runtime without invoking the hook. +pub fn runtime_with_init(init: impl FnOnce(&Handle)) -> &'static Runtime { + RT.get_or_init(|| { + let rt = Runtime::new().expect("failed to create Tokio runtime"); + init(rt.handle()); + rt + }) +} + +/// Bridges DataFusion's async [`SendableRecordBatchStream`] to the synchronous +/// [`RecordBatchReader`] interface that `FFI_ArrowArrayStream` (and therefore +/// the Java `ArrowReader`) consumes. Each call to `next()` drives one +/// `runtime().block_on(stream.next())`, so memory pressure stays bounded by the +/// executor pipeline plus a single in-flight batch. +pub struct StreamingReader { + pub schema: SchemaRef, + pub stream: SendableRecordBatchStream, +} + +impl Iterator for StreamingReader { + type Item = Result; + + fn next(&mut self) -> Option { + // Arrow's C ABI invokes this iterator through FFI_ArrowArrayStream's + // vtable, outside the JNI handler's try_unwrap_or_throw guard. A panic + // here (buggy UDF, arrow cast that panics, runtime poison) would + // unwind across C/FFI -- undefined behaviour. Catch it and surface as + // an ArrowError so the Java side sees a normal exception instead. + let next = catch_unwind(AssertUnwindSafe(|| runtime().block_on(self.stream.next()))); + match next { + Ok(item) => item.map(|r| r.map_err(|e| ArrowError::ExternalError(Box::new(e)))), + Err(panic) => { + let msg = if let Some(s) = panic.downcast_ref::() { + s.clone() + } else if let Some(s) = panic.downcast_ref::<&str>() { + (*s).to_string() + } else { + "rust panic with non-string payload".to_string() + }; + Some(Err(ArrowError::ExternalError( + format!("panic in DataFrame stream: {msg}").into(), + ))) + } + } + } +} + +impl RecordBatchReader for StreamingReader { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} diff --git a/native/Cargo.toml b/native/Cargo.toml index 0362ae6..507f0a9 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -23,8 +23,8 @@ publish = false [lib] # `rlib` alongside `cdylib` so `cargo test` has a Rust-level harness for -# native-only invariants (e.g. error-classification routing through wrapped -# DataFusionError chains). The `cdylib` is still the artifact the JVM loads. +# native-only invariants (the error-classification tests now live in +# `datafusion-jni-common`). The `cdylib` is still the artifact the JVM loads. crate-type = ["cdylib", "rlib"] [features] @@ -75,28 +75,27 @@ runtime-metrics = ["dep:tokio-metrics"] spark = ["dep:datafusion-spark"] [dependencies] -arrow = { version = "58", features = ["ffi"] } -async-trait = "0.1" -datafusion = { version = "53.1.0", features = ["avro"] } -datafusion-proto = "53.1.0" +arrow = { workspace = true } +async-trait = { workspace = true } +datafusion = { workspace = true, features = ["avro"] } +# Shared JNI plumbing (error->exception mapping, runtime singleton, +# StreamingReader). `avro` keeps the classifier's AvroError->IoException arm +# in sync with the `avro` feature on `datafusion` above. +datafusion-jni-common = { path = "../native-common", features = ["avro"] } +datafusion-proto = { workspace = true } # Apache Spark-compatible functions + expression planners. Optional and # gated behind the `spark` feature (in the default set). The `core` feature # of the crate is what exposes `SessionStateBuilderSpark`. -datafusion-spark = { version = "53.1.0", features = ["core"], optional = true } -datafusion-substrait = { version = "53.1.0", optional = true } -futures = "0.3" -jni = "0.21" -# Pin to the same major as DataFusion 53.1 pulls in transitively (0.13.x) -# so we share the same `dyn ObjectStore` vtable and don't double-link. -object_store = { version = "0.13", default-features = false } -prost = "0.14" -tokio = { version = "1", features = ["rt-multi-thread"] } -# Tokio runtime metrics. Optional + cfg-gated: this crate's API surface lives -# behind `--cfg tokio_unstable`, so enabling the `runtime-metrics` feature also -# requires the caller to set `RUSTFLAGS="--cfg tokio_unstable"` at build time. -tokio-metrics = { version = "0.5", optional = true } -url = "2" +datafusion-spark = { workspace = true, features = ["core"], optional = true } +datafusion-substrait = { workspace = true, optional = true } +futures = { workspace = true } +jni = { workspace = true } +object_store = { workspace = true } +prost = { workspace = true } +tokio = { workspace = true } +tokio-metrics = { workspace = true, optional = true } +url = { workspace = true } [build-dependencies] -prost-build = "0.14" -protoc-bin-vendored = "3" +prost-build = { workspace = true } +protoc-bin-vendored = { workspace = true } diff --git a/native/src/arrow.rs b/native/src/arrow.rs index 2bbe7b0..67e5caf 100644 --- a/native/src/arrow.rs +++ b/native/src/arrow.rs @@ -23,10 +23,10 @@ use jni::sys::jlong; use jni::JNIEnv; use prost::Message; -use crate::errors::{try_unwrap_or_throw, JniResult}; use crate::proto_gen::ArrowReadOptionsProto; use crate::runtime; use crate::schema::decode_optional_schema; +use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult}; fn with_arrow_options( env: &mut JNIEnv, diff --git a/native/src/avro.rs b/native/src/avro.rs index 85d4a07..257ae32 100644 --- a/native/src/avro.rs +++ b/native/src/avro.rs @@ -23,10 +23,10 @@ use jni::sys::jlong; use jni::JNIEnv; use prost::Message; -use crate::errors::{try_unwrap_or_throw, JniResult}; use crate::proto_gen::AvroReadOptionsProto; use crate::runtime; use crate::schema::decode_optional_schema; +use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult}; fn with_avro_options( env: &mut JNIEnv, diff --git a/native/src/cache_manager.rs b/native/src/cache_manager.rs index 3b9e286..ec38dc8 100644 --- a/native/src/cache_manager.rs +++ b/native/src/cache_manager.rs @@ -34,8 +34,8 @@ use datafusion::execution::cache::cache_unit::{ }; use datafusion::execution::cache::DefaultListFilesCache; -use crate::errors::JniResult; use crate::proto_gen::CacheManagerOptionsProto; +use datafusion_jni_common::errors::JniResult; /// Build a [`CacheManagerConfig`] from the proto. Returns `Ok(None)` if the /// caller did not set any cache-manager field, so the JNI layer can skip the diff --git a/native/src/csv.rs b/native/src/csv.rs index 3ae4627..b79ed59 100644 --- a/native/src/csv.rs +++ b/native/src/csv.rs @@ -26,12 +26,12 @@ use jni::sys::jlong; use jni::JNIEnv; use prost::Message; -use crate::errors::{try_unwrap_or_throw, JniResult}; use crate::proto_gen::{ CsvReadOptionsProto, CsvWriteOptionsProto, FileCompressionType as ProtoFileCompressionType, }; use crate::runtime; use crate::schema::decode_optional_schema; +use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult}; fn with_csv_options( env: &mut JNIEnv, diff --git a/native/src/json.rs b/native/src/json.rs index 8eea32f..b87be78 100644 --- a/native/src/json.rs +++ b/native/src/json.rs @@ -27,12 +27,12 @@ use jni::sys::jlong; use jni::JNIEnv; use prost::Message; -use crate::errors::{try_unwrap_or_throw, JniResult}; use crate::proto_gen::{ FileCompressionType as ProtoFileCompressionType, JsonWriteOptionsProto, NdJsonReadOptionsProto, }; use crate::runtime; use crate::schema::decode_optional_schema; +use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult}; fn with_json_options( env: &mut JNIEnv, diff --git a/native/src/lib.rs b/native/src/lib.rs index 43161d2..56bef5d 100644 --- a/native/src/lib.rs +++ b/native/src/lib.rs @@ -19,7 +19,6 @@ mod arrow; mod avro; mod cache_manager; mod csv; -mod errors; mod jni_util; mod json; mod memory; @@ -34,16 +33,13 @@ pub(crate) mod proto_gen { include!(concat!(env!("OUT_DIR"), "/datafusion_java.rs")); } -use std::panic::{catch_unwind, AssertUnwindSafe}; use std::path::PathBuf; use std::sync::{Arc, OnceLock}; -use datafusion::arrow::array::RecordBatch; use datafusion::arrow::datatypes::SchemaRef; -use datafusion::arrow::error::ArrowError; use datafusion::arrow::ffi_stream::FFI_ArrowArrayStream; use datafusion::arrow::ipc::writer::StreamWriter; -use datafusion::arrow::record_batch::{RecordBatchIterator, RecordBatchReader}; +use datafusion::arrow::record_batch::RecordBatchIterator; use datafusion::common::{JoinType, UnnestOptions}; use datafusion::config::TableParquetOptions; use datafusion::dataframe::DataFrame; @@ -51,11 +47,9 @@ use datafusion::dataframe::DataFrameWriteOptions; use datafusion::error::DataFusionError; use datafusion::execution::disk_manager::{DiskManagerBuilder, DiskManagerMode}; use datafusion::execution::runtime_env::{RuntimeEnv, RuntimeEnvBuilder}; -use datafusion::execution::SendableRecordBatchStream; use datafusion::logical_expr::Expr; use datafusion::logical_expr::{col, Partitioning, ScalarUDF, Signature, SortExpr}; use datafusion::prelude::{ParquetReadOptions, SessionConfig, SessionContext}; -use futures::StreamExt; use jni::objects::{JBooleanArray, JByteArray, JClass, JObject, JObjectArray, JString}; use jni::sys::{jboolean, jbyte, jbyteArray, jint, jlong}; use jni::JNIEnv; @@ -63,7 +57,10 @@ use jni::JavaVM; use prost::Message; use tokio::runtime::Runtime; -use crate::errors::{try_unwrap_or_throw, JniResult}; +use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult}; +// Re-exported so sibling modules keep their crate-local `crate::StreamingReader` path. +pub(crate) use datafusion_jni_common::StreamingReader; + use crate::proto_gen::ParquetReadOptionsProto; use crate::proto_gen::SessionOptions; use crate::schema::decode_optional_schema; @@ -84,18 +81,15 @@ pub(crate) fn jvm() -> &'static JavaVM { } pub(crate) fn runtime() -> &'static Runtime { - static RT: OnceLock = OnceLock::new(); - RT.get_or_init(|| { - let rt = Runtime::new().expect("failed to create Tokio runtime"); - // Eagerly install the runtime-metrics accumulator (no-op when the - // `runtime-metrics` Cargo feature is off). Initialising here -- not - // lazily on the first `runtimeStats()` call -- means the - // RuntimeMonitor's sampling baseline coincides with runtime start, so - // poll/park/busy totals reflect activity from the first query onward - // rather than from the first observation. - crate::runtime_metrics::init(rt.handle()); - rt - }) + // The singleton itself lives in datafusion-jni-common (shared with the + // datafusion-spark-bridge SDK; each cdylib statically links its own + // copy, so the runtime stays per-library). The init hook eagerly installs the + // runtime-metrics accumulator (no-op when the `runtime-metrics` Cargo + // feature is off). Initialising here -- not lazily on the first + // `runtimeStats()` call -- means the RuntimeMonitor's sampling baseline + // coincides with runtime start, so poll/park/busy totals reflect activity + // from the first query onward rather than from the first observation. + datafusion_jni_common::runtime_with_init(crate::runtime_metrics::init) } /// Wrap the (already-built) `RuntimeEnvBuilder`'s memory pool with a @@ -324,50 +318,6 @@ pub extern "system" fn Java_org_apache_datafusion_DataFrame_collectDataFrame<'lo }) } -/// Bridges DataFusion's async [`SendableRecordBatchStream`] to the synchronous -/// [`RecordBatchReader`] interface that `FFI_ArrowArrayStream` (and therefore -/// the Java `ArrowReader`) consumes. Each call to `next()` drives one -/// `runtime().block_on(stream.next())`, so memory pressure stays bounded by the -/// executor pipeline plus a single in-flight batch. -struct StreamingReader { - schema: SchemaRef, - stream: SendableRecordBatchStream, -} - -impl Iterator for StreamingReader { - type Item = Result; - - fn next(&mut self) -> Option { - // Arrow's C ABI invokes this iterator through FFI_ArrowArrayStream's - // vtable, outside the JNI handler's try_unwrap_or_throw guard. A panic - // here (buggy UDF, arrow cast that panics, runtime poison) would - // unwind across C/FFI -- undefined behaviour. Catch it and surface as - // an ArrowError so the Java side sees a normal exception instead. - let next = catch_unwind(AssertUnwindSafe(|| runtime().block_on(self.stream.next()))); - match next { - Ok(item) => item.map(|r| r.map_err(|e| ArrowError::ExternalError(Box::new(e)))), - Err(panic) => { - let msg = if let Some(s) = panic.downcast_ref::() { - s.clone() - } else if let Some(s) = panic.downcast_ref::<&str>() { - (*s).to_string() - } else { - "rust panic with non-string payload".to_string() - }; - Some(Err(ArrowError::ExternalError( - format!("panic in DataFrame stream: {msg}").into(), - ))) - } - } - } -} - -impl RecordBatchReader for StreamingReader { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - #[no_mangle] pub extern "system" fn Java_org_apache_datafusion_DataFrame_executeStreamDataFrame<'local>( mut env: JNIEnv<'local>, diff --git a/native/src/object_store.rs b/native/src/object_store.rs index eefccf2..985d721 100644 --- a/native/src/object_store.rs +++ b/native/src/object_store.rs @@ -28,9 +28,9 @@ use std::sync::Arc; use datafusion::prelude::SessionContext; use url::Url; -use crate::errors::JniResult; use crate::proto_gen::object_store_registration::Backend; use crate::proto_gen::ObjectStoreRegistration; +use datafusion_jni_common::errors::JniResult; #[cfg(feature = "object-store-gcp")] use crate::proto_gen::GcsOptions; diff --git a/native/src/proto.rs b/native/src/proto.rs index 4f187bc..c1315f9 100644 --- a/native/src/proto.rs +++ b/native/src/proto.rs @@ -28,8 +28,8 @@ use jni::sys::{jbyteArray, jlong}; use jni::JNIEnv; use prost::Message; -use crate::errors::{try_unwrap_or_throw, JniResult}; use crate::runtime; +use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult}; #[no_mangle] pub extern "system" fn Java_org_apache_datafusion_SessionContext_createDataFrameFromProto< diff --git a/native/src/runtime_metrics.rs b/native/src/runtime_metrics.rs index e69410e..dd60dcb 100644 --- a/native/src/runtime_metrics.rs +++ b/native/src/runtime_metrics.rs @@ -38,7 +38,7 @@ //! 10 totalOverflowCount #[cfg(not(feature = "runtime-metrics"))] -use crate::errors::JniResult; +use datafusion_jni_common::errors::JniResult; /// Number of i64 values in the snapshot array; kept here so the Java side and /// the feature-off stub agree on the layout. @@ -51,7 +51,7 @@ mod imp { use tokio_metrics::{RuntimeIntervals, RuntimeMonitor}; use super::STATS_FIELD_COUNT; - use crate::errors::JniResult; + use datafusion_jni_common::errors::JniResult; /// `RuntimeMonitor::intervals().next()` returns *delta* metrics covering /// the period since the previous call (or, on the very first call, since @@ -196,7 +196,7 @@ pub fn runtime_stats() -> JniResult<[i64; STATS_FIELD_COUNT]> { Err( "datafusion-jni was built without the `runtime-metrics` Cargo feature; \ rebuild the native crate with \ - `RUSTFLAGS=\"--cfg tokio_unstable\" cargo build --features runtime-metrics` \ + `RUSTFLAGS=\"--cfg tokio_unstable\" cargo build -p datafusion-jni --features runtime-metrics` \ to enable SessionContext.runtimeStats" .into(), ) diff --git a/native/src/schema.rs b/native/src/schema.rs index 968a73a..0c3c7ab 100644 --- a/native/src/schema.rs +++ b/native/src/schema.rs @@ -20,7 +20,7 @@ use datafusion::arrow::ipc::reader::StreamReader; use jni::objects::JByteArray; use jni::JNIEnv; -use crate::errors::JniResult; +use datafusion_jni_common::errors::JniResult; /// Decode an optional Arrow-IPC schema byte array passed in from Java. /// Returns `None` if the byte-array reference is null. diff --git a/pom.xml b/pom.xml index 6210841..b92cf72 100644 --- a/pom.xml +++ b/pom.xml @@ -95,6 +95,11 @@ under the License. + + org.apache.maven.plugins + maven-compiler-plugin + 3.13.0 + org.apache.maven.plugins maven-surefire-plugin @@ -159,6 +164,7 @@ under the License. README.md CONTRIBUTING.md docs/** + **/*.md .gitignore .idea/** @@ -173,12 +179,17 @@ under the License. .mvn/** **/target/** - native/target/** + rust-target/** tpch-data/** - - native/Cargo.lock + + Cargo.lock + + **/META-INF/services/** dev/release/rat_exclude_files.txt + + spark/scaffold/bridge-template/** From fd4dd74f3d8184a2002cd29131bc3d07fe9c525f Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 12 Jun 2026 14:08:27 +0200 Subject: [PATCH 2/8] build: inherit crate version/edition from workspace; publish datafusion-jni-common Add [workspace.package] (version, edition, license, repository) to the root manifest and have both crates inherit it via `*.workspace = true`, so a single bump re-versions the workspace in lock step. Make datafusion-jni-common publishable: drop `publish = false` and add a `description` (crates.io requires it). All its dependencies are registry crates, so nothing blocks publish. datafusion-jni stays `publish = false` since it is a JVM-loaded cdylib, not a crates.io library. Co-Authored-By: Claude Opus 4.8 (1M context) --- Cargo.toml | 9 +++++++++ native-common/Cargo.toml | 8 +++++--- native/Cargo.toml | 7 +++++-- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d7f98f7..fd1971a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,15 @@ members = [ "native-common", ] +# Shared package metadata so every crate moves in lock step. Members inherit +# via `version.workspace = true` / `edition.workspace = true` etc.; a single +# bump here re-versions the whole workspace. +[workspace.package] +version = "0.1.0" +edition = "2021" +license = "Apache-2.0" +repository = "https://github.com/apache/datafusion-java" + # Every dependency used by any workspace member is declared here so version # bumps live in one place and the resolver picks a single version of each # crate across the workspace. Members reference these via `{ workspace = true }` diff --git a/native-common/Cargo.toml b/native-common/Cargo.toml index 0a797b4..ffad7c5 100644 --- a/native-common/Cargo.toml +++ b/native-common/Cargo.toml @@ -17,9 +17,11 @@ [package] name = "datafusion-jni-common" -version = "0.1.0" -edition = "2021" -publish = false +version.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true +description = "Shared JNI plumbing for DataFusion Java native crates: error-to-exception mapping, the per-cdylib Tokio runtime singleton, and the async-stream-to-FFI_ArrowArrayStream bridge." [features] # `datafusion-jni` builds DataFusion with `avro`, which adds the diff --git a/native/Cargo.toml b/native/Cargo.toml index 507f0a9..c040448 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -17,8 +17,11 @@ [package] name = "datafusion-jni" -version = "0.1.0" -edition = "2021" +version.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true +# cdylib JNI artifact loaded by the JVM, not a crates.io library. publish = false [lib] From 2e50d6e63fa52882cf81237dcd99297f64f4cc24 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 12 Jun 2026 14:10:11 +0200 Subject: [PATCH 3/8] build: drop forward-referencing RAT excludes from foundation split The `**/META-INF/services/**` and `spark/scaffold/bridge-template/**` RAT excludes guard files that do not exist until later splits in the stack. They are dead config here. Re-added in the splits that introduce the files they cover (META-INF/services in 04-spark-scala-connector, bridge-template in 05-bridge-scaffold). Co-Authored-By: Claude Opus 4.8 (1M context) --- pom.xml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pom.xml b/pom.xml index b92cf72..2d3ae4f 100644 --- a/pom.xml +++ b/pom.xml @@ -183,13 +183,8 @@ under the License. tpch-data/** Cargo.lock - - **/META-INF/services/** dev/release/rat_exclude_files.txt - - spark/scaffold/bridge-template/** From 9d2096c86b491a4eeaaee02e6cfb8cf0bf7508c1 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 12 Jun 2026 14:12:06 +0200 Subject: [PATCH 4/8] build: drop overbroad **/*.md RAT exclude The `**/*.md` exclude dropped every markdown file repo-wide from license checking, not just docs. It was redundant (docs are already covered by `docs/**`) and overly broad. Removing it; RAT still reports 0 unapproved because the remaining markdown carries valid headers. Co-Authored-By: Claude Opus 4.8 (1M context) --- pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2d3ae4f..7ceec07 100644 --- a/pom.xml +++ b/pom.xml @@ -164,7 +164,6 @@ under the License. README.md CONTRIBUTING.md docs/** - **/*.md .gitignore .idea/** From 2514f56d6346b27a8cefb097c2544cb1a3c24600 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 12 Jun 2026 14:16:51 +0200 Subject: [PATCH 5/8] docs: add README for datafusion-jni-common Add a crate README so the crates.io listing has front-page content, and wire it in via `readme = "README.md"`. The ASF license header is an HTML comment (matching dev/release/README.md) so RAT approves it while it stays invisible in the rendered markdown. Co-Authored-By: Claude Opus 4.8 (1M context) --- native-common/Cargo.toml | 1 + native-common/README.md | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 native-common/README.md diff --git a/native-common/Cargo.toml b/native-common/Cargo.toml index ffad7c5..7ce7922 100644 --- a/native-common/Cargo.toml +++ b/native-common/Cargo.toml @@ -21,6 +21,7 @@ version.workspace = true edition.workspace = true license.workspace = true repository.workspace = true +readme = "README.md" description = "Shared JNI plumbing for DataFusion Java native crates: error-to-exception mapping, the per-cdylib Tokio runtime singleton, and the async-stream-to-FFI_ArrowArrayStream bridge." [features] diff --git a/native-common/README.md b/native-common/README.md new file mode 100644 index 0000000..aadf877 --- /dev/null +++ b/native-common/README.md @@ -0,0 +1,37 @@ + + +# datafusion-jni-common + +Shared JNI plumbing for the [Apache DataFusion Java](https://github.com/apache/datafusion-java) +native crates. It holds the pieces every DataFusion-backed `cdylib` loaded into a +JVM needs, factored out so they live in one place. + +## Linking model + +Each consuming `cdylib` statically links its own copy of this crate, so the +runtime singleton is per-library, not per-process. Nothing here is exported with +`#[no_mangle]`, so linking it into several `cdylib`s loaded in one JVM cannot +collide. + +## Status + +This crate is an implementation detail of Apache DataFusion Java. Its API may +change between releases to track the needs of the native crates that depend on +it. From 3ff4fda80bdf201fcb9b9fddd51071c39b172629 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 12 Jun 2026 14:27:12 +0200 Subject: [PATCH 6/8] fix: scope foundation docs/excludes to what 1/6 ships Address review feedback on the workspace-foundation PR: - development.md: trim the repo-layout section to the crates this PR actually ships (native, native-common). It was forward-referencing spark/, spark/bridge, datafusion-spark-bridge, and examples/native -- none of which exist until later PRs in the stack -- and called the member list "three" while listing four. Later PRs (#105/#106/#107/#109) carry notes to re-add their own slice when those dirs land. - rat_exclude_files.txt: the Rust lockfile moved to the workspace root, so the stale native/Cargo.lock entry left the root Cargo.lock with no RAT exclude for the source-tarball check (check-rat-report.py). Point it at Cargo.lock. - native-common: dedupe the panic-payload downcast -- StreamingReader::next now calls errors::panic_message instead of repeating the String/&str match inline. Co-Authored-By: Claude Opus 4.8 (1M context) --- dev/release/rat_exclude_files.txt | 2 +- docs/source/contributor-guide/development.md | 25 +++++++++----------- native-common/src/errors.rs | 5 +++- native-common/src/lib.rs | 8 +------ 4 files changed, 17 insertions(+), 23 deletions(-) diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 81d83e8..3dbd90f 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -7,7 +7,7 @@ .mvn/wrapper/maven-wrapper.properties mvnw mvnw.cmd -native/Cargo.lock +Cargo.lock dev/release/rat_exclude_files.txt docs/source/_static/** docs/source/conf.py diff --git a/docs/source/contributor-guide/development.md b/docs/source/contributor-guide/development.md index fdb00f4..61d4fb0 100644 --- a/docs/source/contributor-guide/development.md +++ b/docs/source/contributor-guide/development.md @@ -74,25 +74,22 @@ disk space. The repository is a multi-module Maven build: -- `Cargo.toml` — Rust workspace root declaring the three crate members - (`native`, `native-common`, `examples/native`, `spark/bridge`) and `[workspace.dependencies]` - that pin shared versions in one place. Cargo writes artifacts to - `rust-target/` (overridden in `.cargo/config.toml`) so `mvn clean` at the - repo root does not nuke the Rust build cache. -- `pom.xml` — parent POM declaring the `core`, `spark`, and `examples` - modules and shared plugin/dependency versions. +- `Cargo.toml` — Rust workspace root declaring the crate members + (`native`, `native-common`) and `[workspace.dependencies]` that pin + shared versions in one place. Cargo writes artifacts to `rust-target/` + (overridden in `.cargo/config.toml`) so `mvn clean` at the repo root does + not nuke the Rust build cache. +- `pom.xml` — parent POM declaring the `core` and `examples` modules and + shared plugin/dependency versions. - `core/` — `datafusion-java` library module (Java sources, tests, and generated protobuf classes). -- `spark/` — `datafusion-java-spark` Spark DataSource V2 connector - (Scala + Java, pure JVM) and its `spark/bridge/` Rust SDK crate - (`datafusion-spark-bridge`: widening, scan machinery, `export_bridge!`). - `examples/` — `datafusion-java-examples` module containing runnable examples that depend on the library; built alongside the library so they - cannot fall out of sync with the API. Includes `examples/native/`, a - small `export_bridge!` cdylib used by the Spark connector demo - (`ExampleBridgeProviderFactory` + the pyspark script under - `examples/python/`). + cannot fall out of sync with the API. - `native/` — `datafusion-jni` Rust crate (JNI + Arrow C Data Interface). +- `native-common/` — `datafusion-jni-common` Rust crate: JNI plumbing + shared across native crates (error→exception mapping, the per-cdylib + Tokio runtime singleton, the async-stream→`FFI_ArrowArrayStream` bridge). - `proto/` — Protobuf definitions shared between Java and Rust. - `Makefile` — top-level build orchestration (`make test`, `make format`, `make tpch-data`). diff --git a/native-common/src/errors.rs b/native-common/src/errors.rs index caa2540..f9dbb03 100644 --- a/native-common/src/errors.rs +++ b/native-common/src/errors.rs @@ -164,7 +164,10 @@ fn throw(env: &mut JNIEnv, class: &str, message: &str) { let _ = env.throw_new(class, message); } -fn panic_message(panic: &Box) -> String { +/// Best-effort extraction of a panic payload's message. `catch_unwind` hands +/// back a `Box`; the payload is a `String` or `&str` for ordinary +/// `panic!`/`unwrap` sites, anything else is opaque. +pub fn panic_message(panic: &Box) -> String { if let Some(s) = panic.downcast_ref::() { s.clone() } else if let Some(s) = panic.downcast_ref::<&str>() { diff --git a/native-common/src/lib.rs b/native-common/src/lib.rs index f143d43..ba47004 100644 --- a/native-common/src/lib.rs +++ b/native-common/src/lib.rs @@ -82,13 +82,7 @@ impl Iterator for StreamingReader { match next { Ok(item) => item.map(|r| r.map_err(|e| ArrowError::ExternalError(Box::new(e)))), Err(panic) => { - let msg = if let Some(s) = panic.downcast_ref::() { - s.clone() - } else if let Some(s) = panic.downcast_ref::<&str>() { - (*s).to_string() - } else { - "rust panic with non-string payload".to_string() - }; + let msg = errors::panic_message(&panic); Some(Err(ArrowError::ExternalError( format!("panic in DataFrame stream: {msg}").into(), ))) From bb023ebd52c79a85359a2be48bea1a06207199f4 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 12 Jun 2026 14:43:27 +0200 Subject: [PATCH 7/8] build: point release scripts at workspace rust-target/ dir The Cargo workspace conversion redirects build output to rust-target/ (via .cargo/config.toml), but the dev/release scripts still built from native/ and read native/target/release/, which is no longer populated even when cargo runs inside native/ (config is discovered up-tree). - build-native-libs.sh / build-release.sh: build from the repo root with `-p datafusion-jni` and copy from rust-target/{release,/release}/. - verify-release-candidate.sh: run `cargo fmt --all` workspace-wide so the new native-common crate is covered (matches CI lint.yml). - updating-datafusion-version.md: list the workspace.dependencies entries that actually exist (datafusion, -proto, -spark, -substrait); drop the stray datafusion-ffi reference. Co-Authored-By: Claude Opus 4.8 (1M context) --- dev/release/build-release.sh | 14 ++++++++------ .../datafusion-java-rm/build-native-libs.sh | 9 +++++---- dev/release/verify-release-candidate.sh | 3 ++- .../updating-datafusion-version.md | 4 ++-- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/dev/release/build-release.sh b/dev/release/build-release.sh index 2b033bb..4d4ab13 100755 --- a/dev/release/build-release.sh +++ b/dev/release/build-release.sh @@ -135,26 +135,28 @@ JVM_TARGET_DIR="$PROJECT_HOME/core/target/classes/org/apache/datafusion" mkdir -p "$JVM_TARGET_DIR/linux/amd64" docker cp \ - "$CONTAINER_AMD64:/opt/datafusion-java-rm/datafusion-java/native/target/release/libdatafusion_jni.so" \ + "$CONTAINER_AMD64:/opt/datafusion-java-rm/datafusion-java/rust-target/release/libdatafusion_jni.so" \ "$JVM_TARGET_DIR/linux/amd64/" mkdir -p "$JVM_TARGET_DIR/linux/aarch64" docker cp \ - "$CONTAINER_ARM64:/opt/datafusion-java-rm/datafusion-java/native/target/release/libdatafusion_jni.so" \ + "$CONTAINER_ARM64:/opt/datafusion-java-rm/datafusion-java/rust-target/release/libdatafusion_jni.so" \ "$JVM_TARGET_DIR/linux/aarch64/" echo "Building macOS native libs on the host (host=$HOST_ARCH)" rustup target add "$OTHER_DARWIN_TARGET" -(cd "$PROJECT_HOME/native" && cargo build --release) -(cd "$PROJECT_HOME/native" && cargo build --release --target "$OTHER_DARWIN_TARGET") +# Cargo writes to the workspace `rust-target/` dir (set in .cargo/config.toml), +# not the per-crate `native/target/`, so build from the repo root. +(cd "$PROJECT_HOME" && cargo build --release -p datafusion-jni) +(cd "$PROJECT_HOME" && cargo build --release -p datafusion-jni --target "$OTHER_DARWIN_TARGET") mkdir -p "$JVM_TARGET_DIR/darwin/$HOST_DARWIN_DIR" -cp "$PROJECT_HOME/native/target/release/libdatafusion_jni.dylib" \ +cp "$PROJECT_HOME/rust-target/release/libdatafusion_jni.dylib" \ "$JVM_TARGET_DIR/darwin/$HOST_DARWIN_DIR/" mkdir -p "$JVM_TARGET_DIR/darwin/$OTHER_DARWIN_DIR" -cp "$PROJECT_HOME/native/target/$OTHER_DARWIN_TARGET/release/libdatafusion_jni.dylib" \ +cp "$PROJECT_HOME/rust-target/$OTHER_DARWIN_TARGET/release/libdatafusion_jni.dylib" \ "$JVM_TARGET_DIR/darwin/$OTHER_DARWIN_DIR/" echo "Installing JAR into local Maven repo" diff --git a/dev/release/datafusion-java-rm/build-native-libs.sh b/dev/release/datafusion-java-rm/build-native-libs.sh index 5f273cc..79f8ae0 100755 --- a/dev/release/datafusion-java-rm/build-native-libs.sh +++ b/dev/release/datafusion-java-rm/build-native-libs.sh @@ -38,8 +38,9 @@ git clone "$REPO" datafusion-java cd datafusion-java git checkout "$BRANCH" -cd native -cargo build --release +# Cargo writes to the workspace `rust-target/` dir (set in .cargo/config.toml), +# not the per-crate `native/target/`, so build from the repo root. +cargo build --release -p datafusion-jni -echo "Built $(pwd)/target/release/libdatafusion_jni.so" -ls -l target/release/libdatafusion_jni.so +echo "Built $(pwd)/rust-target/release/libdatafusion_jni.so" +ls -l rust-target/release/libdatafusion_jni.so diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index e486adc..c7767bf 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -150,7 +150,8 @@ test_source_distribution() { # raises on any formatting errors rustup component add rustfmt - (cd native && cargo fmt --all -- --check) + # Workspace-wide: covers native, native-common, and any future members. + cargo fmt --all -- --check # build native + JVM and run the full test suite make test diff --git a/docs/source/contributor-guide/updating-datafusion-version.md b/docs/source/contributor-guide/updating-datafusion-version.md index ef6cd10..6e3b90b 100644 --- a/docs/source/contributor-guide/updating-datafusion-version.md +++ b/docs/source/contributor-guide/updating-datafusion-version.md @@ -21,8 +21,8 @@ under the License. Three things must move together when bumping DataFusion: -1. `Cargo.toml` (workspace root) — the `datafusion`, `datafusion-ffi`, - `datafusion-proto`, and `datafusion-substrait` entries in +1. `Cargo.toml` (workspace root) — the `datafusion`, `datafusion-proto`, + `datafusion-spark`, and `datafusion-substrait` entries in `[workspace.dependencies]`. Members inherit from there. 2. `pom.xml` — the `` Maven property. **Must equal the Cargo version**; a mismatch means JVM-built protobuf plans won't From 9e9de7d49de038b1853e1c8dc94a0b89ba22b3a4 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 12 Jun 2026 14:46:17 +0200 Subject: [PATCH 8/8] build: mark datafusion-jni-common publish = false No release script or doc describes publishing the Rust crates to crates.io, and the crate is an internal implementation detail of the native crates (its README already says the API may change to track their needs). Match `publish = false` on datafusion-jni so an accidental `cargo publish` can't push it. Co-Authored-By: Claude Opus 4.8 (1M context) --- native-common/Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/native-common/Cargo.toml b/native-common/Cargo.toml index 7ce7922..21a2296 100644 --- a/native-common/Cargo.toml +++ b/native-common/Cargo.toml @@ -21,6 +21,9 @@ version.workspace = true edition.workspace = true license.workspace = true repository.workspace = true +# Implementation detail of datafusion-java's native crates, not a standalone +# crates.io library. Matches `publish = false` on the `datafusion-jni` crate. +publish = false readme = "README.md" description = "Shared JNI plumbing for DataFusion Java native crates: error-to-exception mapping, the per-cdylib Tokio runtime singleton, and the async-stream-to-FFI_ArrowArrayStream bridge."