ANDROID: rust_binder: add priority inheritance

This implements priority inheritance in the same way as it is implemented
in Android Linux: when a thread picks up a synchronous transaction, it
temporarily inherits the sender's scheduler policy and priority (clamped by
the target node's flags and the receiver's rlimits), and restores its
previous priority once the transaction has been handled.

Bug: 278052745
Change-Id: I0df7d4bc5d08c1f6568744701e5eaf90e86ecd00
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Author: Alice Ryhl <aliceryhl@google.com>
Date: 2023-08-29 13:28:13 +00:00
Parent: 691f0f1682
Commit: 2e4f09314c
10 changed files with 382 additions and 4 deletions


@ -58,9 +58,17 @@ pub_no_prefix!(
BC_DEAD_BINDER_DONE
);
pub_no_prefix!(
flat_binder_object_shifts_,
FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT
);
pub_no_prefix!(
flat_binder_object_flags_,
FLAT_BINDER_FLAG_ACCEPTS_FDS,
FLAT_BINDER_FLAG_INHERIT_RT,
FLAT_BINDER_FLAG_PRIORITY_MASK,
FLAT_BINDER_FLAG_SCHED_POLICY_MASK,
FLAT_BINDER_FLAG_TXN_SECURITY_CTX
);


@ -16,6 +16,7 @@ use kernel::{
use crate::{
defs::*,
error::BinderError,
prio::{self, BinderPriority},
process::{NodeRefInfo, Process, ProcessInner},
thread::Thread,
transaction::Transaction,
@ -163,6 +164,22 @@ impl Node {
}
}
pub(crate) fn node_prio(&self) -> prio::BinderPriority {
let flags = self.flags;
// Sign-extend the priority bits through i8, matching the `s8` cast in the
// C driver, so that negative nice values are decoded correctly.
let priority = (flags & FLAT_BINDER_FLAG_PRIORITY_MASK) as i8 as prio::Nice;
let sched_policy =
(flags & FLAT_BINDER_FLAG_SCHED_POLICY_MASK) >> FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT;
BinderPriority {
sched_policy,
prio: prio::to_kernel_prio(sched_policy, priority),
}
}
pub(crate) fn inherit_rt(&self) -> bool {
(self.flags & FLAT_BINDER_FLAG_INHERIT_RT) != 0
}
/// An id that is unique across all binder nodes on the system. Used as the key in the
/// `by_node` map.
pub(crate) fn global_id(&self) -> usize {

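As a hedged illustration of the flag decoding above, assuming the Android UAPI bit layout (FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff, FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT = 9, FLAT_BINDER_FLAG_INHERIT_RT = 0x800; these exact values are assumptions here, not taken from this patch):

    // A node registered with SCHED_FIFO (policy value 1) at RT priority 10,
    // with RT inheritance enabled:
    let flags: u32 = 10 | (1 << 9) | 0x800;
    let nice = (flags & 0xff) as i8 as i32; // 10 (for RT, this holds the RT prio)
    let sched_policy = (flags & (3 << 9)) >> 9; // 1 == SCHED_FIFO
    let inherit_rt = (flags & 0x800) != 0; // true
    // node_prio() then yields prio = MAX_RT_PRIO - 1 - 10 = 89 via to_kernel_prio.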

@ -0,0 +1,80 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
//! This module defines the types and methods relevant to priority inheritance.
use kernel::bindings;
pub(crate) type Policy = core::ffi::c_uint;
pub(crate) type Priority = core::ffi::c_int;
pub(crate) type Nice = core::ffi::c_int;
pub(crate) const SCHED_NORMAL: Policy = bindings::SCHED_NORMAL;
pub(crate) const SCHED_FIFO: Policy = bindings::SCHED_FIFO;
pub(crate) const MIN_NICE: Nice = bindings::MIN_NICE as _;
pub(crate) const MAX_NICE: Nice = bindings::MAX_NICE as _;
pub(crate) const DEFAULT_PRIO: Priority = bindings::DEFAULT_PRIO as _;
pub(crate) const MAX_RT_PRIO: Priority = bindings::MAX_RT_PRIO as _;
/// Scheduler policy and priority.
///
/// The binder driver supports inheriting the following scheduler policies:
/// * SCHED_NORMAL
/// * SCHED_BATCH
/// * SCHED_FIFO
/// * SCHED_RR
#[derive(Copy, Clone, Default)]
pub(crate) struct BinderPriority {
pub(crate) sched_policy: Policy,
pub(crate) prio: Priority,
}
#[derive(Copy, Clone, Eq, PartialEq)]
pub(crate) enum PriorityState {
/// The thread's priority is up to date.
Set,
/// A priority change has been requested but not yet applied.
Pending,
/// A nested transaction set a new priority; abort the in-progress
/// priority restore.
Abort,
}
pub(crate) fn get_default_prio_from_task(task: &kernel::task::Task) -> BinderPriority {
if is_supported_policy(task.policy()) {
BinderPriority {
sched_policy: task.policy(),
prio: task.normal_prio(),
}
} else {
BinderPriority {
sched_policy: SCHED_NORMAL,
prio: DEFAULT_PRIO,
}
}
}
pub(crate) fn is_rt_policy(policy: Policy) -> bool {
policy == bindings::SCHED_FIFO || policy == bindings::SCHED_RR
}
pub(crate) fn is_fair_policy(policy: Policy) -> bool {
policy == bindings::SCHED_NORMAL || policy == bindings::SCHED_BATCH
}
pub(crate) fn is_supported_policy(policy: Policy) -> bool {
is_fair_policy(policy) || is_rt_policy(policy)
}
pub(crate) fn to_userspace_prio(policy: Policy, prio: Priority) -> Nice {
if is_fair_policy(policy) {
prio - DEFAULT_PRIO
} else {
MAX_RT_PRIO - 1 - prio
}
}
pub(crate) fn to_kernel_prio(policy: Policy, prio: Nice) -> Priority {
if is_fair_policy(policy) {
prio + DEFAULT_PRIO
} else {
MAX_RT_PRIO - 1 - prio
}
}
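
As a sanity check on the arithmetic, a hedged worked example using the usual kernel constants (DEFAULT_PRIO = 120, MAX_RT_PRIO = 100) as the assumed `bindings` values:

    // Fair policies map nice [-20, 19] onto kernel prio [100, 139]:
    assert_eq!(to_kernel_prio(SCHED_NORMAL, 10), 130); // 10 + 120
    assert_eq!(to_userspace_prio(SCHED_NORMAL, 130), 10);
    // RT policies invert: RT priority 50 becomes kernel prio 99 - 50 = 49,
    // and the same formula converts back.
    assert_eq!(to_kernel_prio(SCHED_FIFO, 50), 49);
    assert_eq!(to_userspace_prio(SCHED_FIFO, 49), 50);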


@ -37,6 +37,7 @@ use crate::{
defs::*,
error::{BinderError, BinderResult},
node::{Node, NodeDeath, NodeRef},
prio::{self, BinderPriority},
range_alloc::{self, RangeAllocator},
thread::{PushWorkRes, Thread},
DArc, DLArc, DTRWrap, DeliverToRead,
@ -137,6 +138,8 @@ impl ProcessInner {
) -> Result<(), (BinderError, DLArc<dyn DeliverToRead>)> {
// Try to find a ready thread to which to push the work.
if let Some(thread) = self.ready_threads.pop_front() {
work.on_thread_selected(&thread);
// Push to thread while holding state lock. This prevents the thread from giving up
// (for example, because of a signal) when we're about to deliver work.
match thread.push_work(work) {
@ -390,6 +393,8 @@ pub(crate) struct Process {
#[pin]
pub(crate) inner: SpinLock<ProcessInner>,
pub(crate) default_priority: BinderPriority,
// Waitqueue of processes waiting for all outstanding transactions to be
// processed.
#[pin]
@ -447,13 +452,15 @@ impl workqueue::WorkItem for Process {
impl Process {
fn new(ctx: Arc<Context>, cred: ARef<Credential>) -> Result<Arc<Self>> {
let current = kernel::current!();
let list_process = ListArc::pin_init(pin_init!(Process {
ctx,
cred,
default_priority: prio::get_default_prio_from_task(current),
inner <- kernel::new_spinlock!(ProcessInner::new(), "Process::inner"),
node_refs <- kernel::new_mutex!(ProcessNodeRefs::new(), "Process::node_refs"),
freeze_wait <- kernel::new_condvar!("Process::freeze_wait"),
- task: kernel::current!().group_leader().into(),
+ task: current.group_leader().into(),
defer_work <- kernel::new_work!("Process::defer_work"),
links <- ListLinks::new(),
}))?;


@ -3,6 +3,7 @@
// Copyright (C) 2024 Google LLC.
//! Binder -- the Android IPC mechanism.
#![recursion_limit = "256"]
use kernel::{
bindings::{self, seq_file},
@ -26,6 +27,7 @@ mod context;
mod defs;
mod error;
mod node;
mod prio;
mod process;
mod range_alloc;
mod thread;
@ -55,6 +57,10 @@ trait DeliverToRead: ListArcSafe + Send + Sync {
/// won't be delivered.
fn cancel(self: DArc<Self>) {}
/// Called when a work item is delivered directly to a specific thread, rather than to the
/// process work list.
fn on_thread_selected(&self, _thread: &thread::Thread) {}
/// Should we use `wake_up_interruptible_sync` or `wake_up_interruptible` when scheduling this
/// work item?
///


@ -18,7 +18,8 @@ use kernel::{
security,
sync::poll::{PollCondVar, PollTable},
sync::{Arc, SpinLock},
- types::Either,
+ task::Task,
+ types::{ARef, Either},
uaccess::{UserSlice, UserSliceWriter},
};
@ -26,6 +27,7 @@ use crate::{
allocation::{Allocation, AllocationView, BinderObject, BinderObjectRef},
defs::*,
error::BinderResult,
prio::{self, BinderPriority, PriorityState},
process::Process,
ptr_align,
transaction::Transaction,
@ -403,14 +405,22 @@ impl InnerThread {
}
}
/// The priority-inheritance state of a thread. Stored inside the thread's
/// `prio_lock`.
pub(crate) struct ThreadPrioState {
pub(crate) state: PriorityState,
pub(crate) next: BinderPriority,
}
/// This represents a thread that's used with binder.
#[pin_data]
pub(crate) struct Thread {
pub(crate) id: i32,
pub(crate) process: Arc<Process>,
pub(crate) task: ARef<Task>,
#[pin]
inner: SpinLock<InnerThread>,
#[pin]
pub(crate) prio_lock: SpinLock<ThreadPrioState>,
#[pin]
work_condvar: PollCondVar,
/// Used to insert this thread into the process' `ready_threads` list.
///
@ -439,10 +449,17 @@ impl Thread {
pub(crate) fn new(id: i32, process: Arc<Process>) -> Result<Arc<Self>> {
let inner = InnerThread::new()?;
let prio = ThreadPrioState {
state: PriorityState::Set,
next: BinderPriority::default(),
};
Arc::pin_init(pin_init!(Thread {
id,
process,
task: ARef::from(kernel::current!()),
inner <- kernel::new_spinlock!(inner, "Thread::inner"),
prio_lock <- kernel::new_spinlock!(prio, "Thread::prio_lock"),
work_condvar <- kernel::new_poll_condvar!("Thread::work_condvar"),
links <- ListLinks::new(),
links_track <- AtomicListArcTracker::new(),
@ -538,6 +555,8 @@ impl Thread {
return Ok(Some(work));
}
self.restore_priority(&self.process.default_priority);
inner.looper_flags |= LOOPER_WAITING | LOOPER_WAITING_PROC;
let signal_pending = self.work_condvar.wait_interruptible_freezable(&mut inner);
inner.looper_flags &= !(LOOPER_WAITING | LOOPER_WAITING_PROC);
@ -600,6 +619,90 @@ impl Thread {
self.inner.lock().push_return_work(reply);
}
fn do_set_priority(&self, desired: &BinderPriority, verify: bool) {
let task = &*self.task;
let mut policy = desired.sched_policy;
let mut priority;
if task.policy() == policy && task.normal_prio() == desired.prio {
let mut prio_state = self.prio_lock.lock();
if prio_state.state == PriorityState::Pending {
prio_state.state = PriorityState::Set;
}
return;
}
let has_cap_nice = task.has_capability_noaudit(bindings::CAP_SYS_NICE as _);
priority = prio::to_userspace_prio(policy, desired.prio);
if verify && prio::is_rt_policy(policy) && !has_cap_nice {
// For rt_policy, we store the rt priority as a nice. (See to_userspace_prio and
// to_kernel_prio impls.)
let max_rtprio: prio::Nice = task.rlimit_rtprio();
if max_rtprio == 0 {
policy = prio::SCHED_NORMAL;
priority = prio::MIN_NICE;
} else if priority > max_rtprio {
priority = max_rtprio;
}
}
if verify && prio::is_fair_policy(policy) && !has_cap_nice {
let min_nice = task.rlimit_nice();
if min_nice > prio::MAX_NICE {
pr_err!("{} RLIMIT_NICE not set", task.pid());
return;
} else if priority < min_nice {
priority = min_nice;
}
}
if policy != desired.sched_policy || prio::to_kernel_prio(policy, priority) != desired.prio
{
pr_debug!(
"{}: priority {} not allowed, using {} instead",
task.pid(),
desired.prio,
prio::to_kernel_prio(policy, priority),
);
}
let mut prio_state = self.prio_lock.lock();
if !verify && prio_state.state == PriorityState::Abort {
// A new priority has been set by an incoming nested
// transaction. Abort this priority restore and allow
// the transaction to run at the new desired priority.
drop(prio_state);
pr_debug!("{}: aborting priority restore", task.pid());
return;
}
// Set the actual priority.
if task.policy() != policy || prio::is_rt_policy(policy) {
let prio = if prio::is_rt_policy(policy) {
priority
} else {
0
};
task.sched_setscheduler_nocheck(policy as i32, prio, true);
}
if prio::is_fair_policy(policy) {
task.set_user_nice(priority);
}
prio_state.state = PriorityState::Set;
}
pub(crate) fn set_priority(&self, desired: &BinderPriority) {
self.do_set_priority(desired, true);
}
pub(crate) fn restore_priority(&self, desired: &BinderPriority) {
self.do_set_priority(desired, false);
}
fn translate_object(
&self,
obj_index: usize,
@ -1171,7 +1274,7 @@ impl Thread {
}
// We need to complete the transaction even if we cannot complete building the reply.
- (|| -> BinderResult<_> {
+ let out = (|| -> BinderResult<_> {
let completion = DTRWrap::arc_try_new(DeliverCode::new(BR_TRANSACTION_COMPLETE))?;
let process = orig.from.process.clone();
let allow_fds = orig.flags & TF_ACCEPT_FDS != 0;
@ -1191,7 +1294,11 @@ impl Thread {
orig.from.deliver_reply(reply, &orig);
err.reply = BR_TRANSACTION_COMPLETE;
err
- })
+ });
// Restore the priority even on failure.
self.restore_priority(&orig.saved_priority());
out
}
fn oneway_transaction_inner(self: &Arc<Self>, tr: &BinderTransactionDataSg) -> BinderResult {

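To make the `verify` clamping in do_set_priority concrete, here is a minimal standalone sketch under stated assumptions; `clamp_unprivileged`, `max_rtprio`, and `min_nice` are hypothetical names introduced for illustration, not part of the patch:

    // Hedged sketch: `max_rtprio` is RLIMIT_RTPRIO and `min_nice` is
    // RLIMIT_NICE converted to a nice value (see rlimit_nice below).
    fn clamp_unprivileged(
        mut policy: Policy,
        mut prio: Nice,
        max_rtprio: Nice,
        min_nice: Nice,
    ) -> (Policy, Nice) {
        if is_rt_policy(policy) {
            if max_rtprio == 0 {
                // No RT allowance at all: fall back to SCHED_NORMAL, as the
                // driver does, and let the fair clamp below apply.
                policy = SCHED_NORMAL;
                prio = MIN_NICE;
            } else if prio > max_rtprio {
                // Cap the requested RT priority at the rlimit.
                prio = max_rtprio;
            }
        }
        if is_fair_policy(policy) && prio < min_nice {
            // A smaller nice is a higher priority, so clamp up to the rlimit.
            prio = min_nice;
        }
        (policy, prio)
    }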

@ -17,6 +17,7 @@ use crate::{
defs::*,
error::{BinderError, BinderResult},
node::{Node, NodeRef},
prio::{self, BinderPriority, PriorityState},
process::{Process, ProcessInner},
ptr_align,
thread::{PushWorkRes, Thread},
@ -32,6 +33,10 @@ pub(crate) struct Transaction {
#[pin]
allocation: SpinLock<Option<Allocation>>,
is_outstanding: AtomicBool,
set_priority_called: AtomicBool,
priority: BinderPriority,
#[pin]
saved_priority: SpinLock<BinderPriority>,
code: u32,
pub(crate) flags: u32,
data_size: usize,
@ -87,6 +92,16 @@ impl Transaction {
alloc.set_info_target_node(node_ref);
let data_address = alloc.ptr;
let priority =
if (trd.flags & TF_ONE_WAY == 0) && prio::is_supported_policy(from.task.policy()) {
BinderPriority {
sched_policy: from.task.policy(),
prio: from.task.normal_prio(),
}
} else {
from.process.default_priority
};
Ok(DTRWrap::arc_pin_init(pin_init!(Transaction {
target_node: Some(target_node),
from_parent,
@ -100,6 +115,9 @@ impl Transaction {
data_address,
allocation <- kernel::new_spinlock!(Some(alloc), "Transaction::new"),
is_outstanding: AtomicBool::new(false),
priority,
saved_priority <- kernel::new_spinlock!(BinderPriority::default(), "Transaction::saved_priority"),
set_priority_called: AtomicBool::new(false),
txn_security_ctx_off,
oneway_spam_detected,
}))?)
@ -136,11 +154,18 @@ impl Transaction {
data_address: alloc.ptr,
allocation <- kernel::new_spinlock!(Some(alloc), "Transaction::new"),
is_outstanding: AtomicBool::new(false),
priority: BinderPriority::default(),
saved_priority <- kernel::new_spinlock!(BinderPriority::default(), "Transaction::saved_priority"),
set_priority_called: AtomicBool::new(false),
txn_security_ctx_off: None,
oneway_spam_detected,
}))?)
}
pub(crate) fn saved_priority(&self) -> BinderPriority {
*self.saved_priority.lock()
}
/// Determines if the transaction is stacked on top of the given transaction.
pub(crate) fn is_stacked_on(&self, onext: &Option<DArc<Self>>) -> bool {
match (&self.from_parent, onext) {
@ -309,6 +334,11 @@ impl DeliverToRead for Transaction {
}
self.drop_outstanding_txn();
});
// Update thread priority. This only has an effect if the transaction is delivered via the
// process work list, since the priority has otherwise already been updated.
self.on_thread_selected(thread);
let files = if let Ok(list) = self.prepare_file_list() {
list
} else {
@ -391,6 +421,56 @@ impl DeliverToRead for Transaction {
self.drop_outstanding_txn();
}
fn on_thread_selected(&self, to_thread: &Thread) {
// Return immediately if reply.
let target_node = match self.target_node.as_ref() {
Some(target_node) => target_node,
None => return,
};
// We only need to do this once.
if self.set_priority_called.swap(true, Ordering::Relaxed) {
return;
}
let node_prio = target_node.node_prio();
let mut desired = self.priority;
if !target_node.inherit_rt() && prio::is_rt_policy(desired.sched_policy) {
desired.prio = prio::DEFAULT_PRIO;
desired.sched_policy = prio::SCHED_NORMAL;
}
if node_prio.prio < self.priority.prio
|| (node_prio.prio == self.priority.prio && node_prio.sched_policy == prio::SCHED_FIFO)
{
// In case the minimum priority on the node is
// higher (lower value), use that priority. If
// the priority is the same, but the node uses
// SCHED_FIFO, prefer SCHED_FIFO, since it can
// run unbounded, unlike SCHED_RR.
desired = node_prio;
}
let mut prio_state = to_thread.prio_lock.lock();
if prio_state.state == PriorityState::Pending {
// The task is in the process of changing priorities, so
// saving its current values would be incorrect.
// Instead, save the pending priority and signal
// the task to abort the priority restore.
prio_state.state = PriorityState::Abort;
*self.saved_priority.lock() = prio_state.next;
} else {
let task = &*self.to.task;
let mut saved_priority = self.saved_priority.lock();
saved_priority.sched_policy = task.policy();
saved_priority.prio = task.normal_prio();
}
drop(prio_state);
to_thread.set_priority(&desired);
}
fn should_sync_wakeup(&self) -> bool {
self.flags & TF_ONE_WAY == 0
}
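
A hedged worked example of the selection above, with assumed numbers: if the sender runs at SCHED_NORMAL nice 0 (kernel prio 120) and the target node advertises a minimum of SCHED_NORMAL nice -10 (kernel prio 110), then 110 < 120, so the node's priority wins and the handler thread is boosted to nice -10. The thread's previous policy and priority are stashed in `saved_priority` and reinstated by `restore_priority` once the reply has been delivered.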


@ -7,6 +7,7 @@
*/
#include <kunit/test.h>
#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/errname.h>
#include <linux/fdtable.h>
@ -26,6 +27,7 @@
#include <linux/workqueue.h>
#include <uapi/linux/android/binder.h>
#include <uapi/linux/android/binderfs.h>
#include <uapi/linux/sched/types.h>
/* `bindgen` gets confused at certain things. */
const size_t RUST_CONST_HELPER_ARCH_SLAB_MINALIGN = ARCH_SLAB_MINALIGN;


@ -313,6 +313,13 @@ void rust_helper_init_task_work(struct callback_head *twork,
}
EXPORT_SYMBOL_GPL(rust_helper_init_task_work);
unsigned long rust_helper_task_rlimit(const struct task_struct *task,
unsigned int limit)
{
return task_rlimit(task, limit);
}
EXPORT_SYMBOL_GPL(rust_helper_task_rlimit);
void rust_helper_rb_link_node(struct rb_node *node, struct rb_node *parent,
struct rb_node **rb_link)
{


@ -212,6 +212,70 @@ impl Task {
// running.
unsafe { bindings::wake_up_process(self.0.get()) };
}
/// Check if the task has the given capability without logging to the audit log.
pub fn has_capability_noaudit(&self, capability: i32) -> bool {
// SAFETY: By the type invariant, we know that `self.0.get()` is valid.
unsafe { bindings::has_capability_noaudit(self.0.get(), capability) }
}
/// Returns the current scheduling policy.
pub fn policy(&self) -> u32 {
// SAFETY: The task is valid because the shared reference guarantees a nonzero refcount.
//
// This uses a volatile read because C code may be modifying this field in parallel using
// non-atomic unsynchronized writes. This corresponds to how the C macro READ_ONCE is
// implemented.
unsafe { core::ptr::addr_of!((*self.0.get()).policy).read_volatile() }
}
/// Returns the current normal priority.
pub fn normal_prio(&self) -> i32 {
// SAFETY: The task is valid because the shared reference guarantees a nonzero refcount.
//
// This uses a volatile read because C code may be modifying this field in parallel using
// non-atomic unsynchronized writes. This corresponds to how the C macro READ_ONCE is
// implemented.
unsafe { core::ptr::addr_of!((*self.0.get()).normal_prio).read_volatile() }
}
/// Get the rlimit value for RTPRIO.
pub fn rlimit_rtprio(&self) -> i32 {
// SAFETY: By the type invariant, we know that `self.0.get()` is valid, and RLIMIT_RTPRIO
// is a valid limit type.
unsafe { bindings::task_rlimit(self.0.get(), bindings::RLIMIT_RTPRIO) as i32 }
}
/// Get the rlimit value for NICE, converted to a nice value.
pub fn rlimit_nice(&self) -> i32 {
// SAFETY: By the type invariant, we know that `self.0.get()` is valid, and RLIMIT_NICE
// is a valid limit type.
let prio = unsafe { bindings::task_rlimit(self.0.get(), bindings::RLIMIT_NICE) as i32 };
// Convert rlimit style value [1,40] to nice value [-20, 19].
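// For example, a rlimit of 40 maps to nice -20, and a rlimit of 1 maps to nice 19.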
bindings::MAX_NICE as i32 - prio + 1
}
/// Set the scheduling properties for this task without checking whether the task is allowed to
/// set them.
pub fn sched_setscheduler_nocheck(
&self,
policy: i32,
sched_priority: i32,
reset_on_fork: bool,
) {
let params = bindings::sched_param { sched_priority };
let mut policy = policy;
if reset_on_fork {
policy |= bindings::SCHED_RESET_ON_FORK as i32;
}
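// SAFETY: By the type invariant, we know that `self.0.get()` is valid, and
// `params` is a valid `sched_param` for the duration of the call.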
unsafe { bindings::sched_setscheduler_nocheck(self.0.get(), policy, &params) };
}
/// Set the nice value of this task.
pub fn set_user_nice(&self, nice: i32) {
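// SAFETY: By the type invariant, we know that `self.0.get()` is valid.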
unsafe { bindings::set_user_nice(self.0.get(), nice as _) };
}
}
impl Kuid {