Distinguish probability distributions
A source that outputs symbols. With prob dist q. Makes symbol k.
Is the prob dist q₁ or q₂?
We ask the source to make a symbol. Then we guess the probability distribution.
import Random_variable variables (qₙ₁ qₙ₂ : (ℕ → ι) → ℝ) [rnd_var n qₙ₁] [rnd_var n qₙ₂] /- When might we guess q₁ for a given symbol k? When the prob of k occuring according to q₁ is large. -/ /-- if output symbol is one of these, then guess q₁ -/ def U (T : ℝ) := { k | (qₙ₁ k) ≥ (qₙ₂ k) * exp T } -- Note: k is a sequence of symbols. It can have len 1 or more. -- rewrite example : U (T : ℝ) = { k | log(qₙ₁ k / qₙ₂ k) ≥ T } /- What if T was a function of k? A third prob dist maybe? Can we guess better? -/
Let the source be a 6 faced die.
Let q₁ := {'1': 1, '2': 0, '3': 0, '4': 0, '5': 0, '6': 0} Let q₂ := {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0, '6': 1}
U (T : ℝ) := { k | (q₁ k) ≥ (q₂ k) * exp T } = {1, 2, 3, 4, 5}
We roll the die and get 1 uhhh, q₁?
We roll the die and get 1 uhhh, q₁? correct!
We roll the die and get 6 q₂?
We roll the die and get 6 q₂? yes!
Let q₁ := {'1': 7/8, '2': 0, '3': 0, '4': 0, '5': 0, '6': 1/8} Let q₂ := {'1': 1/8, '2': 0, '3': 0, '4': 0, '5': 0, '6': 7/8}
U (T : ℝ) := { k | (q₁ k) ≥ (q₂ k) * exp T } = {1, 2, 3, 4, 5} for exp T = 1/2
the die can be q₁ and produce a symbol k ∉ U T, like 6 in this case our strategy fails. And we guess q₂ incorrectly.
what's the probability of this happening?
/-- Prob. of error: guess q₁ when actually q₂ -/
def α (T : ℝ) : ℝ := ∑ k ∈ (U T), (qₙ₂ k)
/-- Prob. of error: guess q₂ when actually q₁ -/
def β (T : ℝ) : ℝ := ∑ k ∉ (U T), (qₙ₁ k)
we want α and β as small as possible
How to choose U T?
If we reduce U T then we reduce α
If we reduce U T then we increase β
theorem trade_off (T₁ T₂ : R) : β qₙ₁ qₙ₂ T₁ < β qₙ₁ qₙ₂ T₂ → α qₙ₁ qₙ₂ T₁ > α qₙ₁ qₙ₂ T₂ := sorry
🤔
The Neyman-Pearson approach:
The Neyman-Pearson approach: Assume β ≤ βₘ.
The Neyman-Pearson approach: Assume β ≤ βₘ. Then minimize α.
import Relative_entropy /-- How close to q₂ can we get if we stay ε-close to q₁? -/ def err_exp (ε : ℝ) := min { b : ℝ | ∀ qₙ, b = discrimination qₙ qₙ₂ ∧ discrimination qₙ₁ qₙ ≤ ε } example : err_exp qₙ₁ qₙ₂ 0 = discrimination qₙ₁ qₙ₂ := sorry lemma lim_err_exp {ε : ℝ} : lim at_top λ n, err_exp qₙ₁ qₙ₂ ε/n = discrimination qₙ₁ qₙ₂ := by sorry /-- can get δ-close to q₂ when ε-close to q₁ -/ def achieves_err_exp [rnd_var n qₙ] [discrimination qₙ₁ qₙ ≤ ε] (δ : ℝ) := discrimination qₙ qₙ₂ ≥ δ /-- not closer to q₂ if closer to q₁ -/ lemma err_exp_nonincreasing : ε > ε' → err_exp qₙ₁ qₙ₂ ε ≤ err_exp qₙ₁ qₙ₂ ε' := sorry variables [iid qₙ q] lemma err_exp_of_iid [iid qₙ₁ q₁] [iid qₙ₂ q₂] : err_exp qₙ₁ qₙ₂ ε = n * err_exp q₁ q₂ (ε/n) := begin calc err_exp qₙ₁ qₙ₂ ε = min { discrimination qₙ qₙ₂ | discrimination qₙ₁ qₙ ≤ ε } : by rw err_exp ... = min { n * discrimination q q₂ | n * discrimination q₁ q ≤ ε } : by rw discrimination_additive_of_iid ... = min { n * discrimination q q₂ | discrimination q₁ q ≤ ε/n } : by simp ... = n * min { discrimination q q₂ | discrimination q₁ q ≤ ε/n } : by simp ... = n * err_exp q₁ q₂ (ε/n) : by rw err_exp, end
def ψ (s : ℝ) := log( ∑ k, (qₙ₁ k)^s * ((qₙ₁ k) * (qₙ₂ k))^(1 / (1+s)) )^(1+s)
def φ (s : ℝ) (r : ℝ) := - s * r - (ψ s)
/-
"tangent to err_exp r at some s
and otherwise below it since err_exp is convex"
-/
example : d/ds ψ(s)|0 = discrimination qₙ₁ qₙ₂ := sorry
theorem err_exp_rw :
err_exp qₙ₁ qₙ₂ r = max { φ s r | s ≥ 0 } := sorry
variables (r : ℝ) [r > 0]
(T : ℝ) [T = err_exp qₙ₁ qₙ₂ r - r] -- consider this partition
/-- Theorem 9 in Blahut1974: larger r → smaller β -/
theorem prob_of_β_error_le : β qₙ₁ qₙ₂ T ≤ exp(-r) :=
begin
/- Let s parametrize the point (r, err_exp qₙ₁ qₙ₂ r) -/
calc β qₙ₁ qₙ₂ T = ∑ k ∉ U T, qₙ₁ k : by sorry
... ≤ ∑ k, (qₙ₁ k) * (qₙ₂ k / qₙ₁ k)^(1/(1+s)) * exp(T/(1+s))) : by sorry
... = exp(-r) : by sorry,
end
/--
smaller error α if q's r-close to q₁ are farther from q₂
Theorem 9 in Blahut1974
-/
theorem prob_of_α_error_le :
α qₙ₁ qₙ₂ T ≤ exp(-err_exp qₙ₁ qₙ₂ r) :=
begin
/- Let s parametrize the point (r, err_exp r) -/
calc α qₙ₁ qₙ₂ T = ∑ k ∈ U T, qₙ₂ k : by sorry
... ≤ ∑ k, (qₙ₂ k) * (qₙ₁ k / qₙ₂ k)^(s/(1+s)) * exp(-T*s/(1+s))) : by sorry
... = exp(-err_exp qₙ₁ qₙ₂ r) : by sorry,
end
/-- Corollary 1 in Blahut1974 -/
theorem prob_of_α_error_le_of_iid [iid qₙ₁ q₁] [iid qₙ₂ q₂] :
α qₙ₁ qₙ₂ T ≤ exp(-n * err_exp q₁ q₂ r/n) :=
by simp only [prob_of_α_error_le, err_exp_of_iid]
/-- Corollary 1 in Blahut1974 -/
theorem prob_of_β_error_le_of_iid [iid qₙ₁ q₁] [iid qₙ₂ q₂] :
β qₙ₁ qₙ₂ T ≤ exp(-n*r) := sorry
theorem log_prob_of_α_error_le :
log α qₙ₁ qₙ₂ T ≤ -err_exp qₙ₁ qₙ₂ r :=
by simp_only [prob_of_α_error_le]
theorem log_prob_of_α_error_le_of_iid [iid qₙ₁ q₁] [iid qₙ₂ q₂] :
log α qₙ₁ qₙ₂ T ≤ -n * err_exp q₁ q₂ (r/n) :=
by simp_only [log_prob_of_α_error_le, err_exp_of_iid]
theorem limsup_prob_of_α_error_le [iid qₙ₁ q₁] [iid qₙ₂ q₂] :
limsup at_top λ n, 1/n * log α qₙ₁ qₙ₂ T ≤ - discrimination q₁ q₂ :=
by simp_only [log_prob_of_α_error_le_of_iid]
theorem lim_log_prob_of_α_error {r > 0} [iid qₙ₁ q₁] [iid qₙ₂ q₂] :
∀ γ > 0,
lim at_top λ n, 1/n * log α qₙ₁ qₙ₂ T = - discrimination q₁ q₂ :=
by simp only [liminf_log_prob_of_α_error_ge, limsup_prob_of_α_error_le]
variable [T = r - err_exp qₙ₁ qₙ₂ r]
variables [δ > 0]
(qₙ : (ℕ → ι) → ℝ) [rnd_var n qₙ] [achieves_err_exp qₙ δ]
lemma T_range :
T ∈ [- discrimination qₙ₁ qₙ₂, discrimination qₙ₁ qₙ₂] := sorry
def U₁ (ε : ℝ) : { k | exp(-ε) < (qₙ₂ k / qₙ k) * exp(err_exp qₙ₁ qₙ₂ r) ≤ (qₙ₁ k / qₙ k) * exp(r) }
def U₂ (ε : ℝ) = { k | exp(-ε) < (qₙ₁ k / qₙ k) * exp(r) ≤ (qₙ₂ k / qₙ k) * exp(err_exp qₙ₁ qₙ₂ r) }
def UA (ε : ℝ) = { k | log(qₙ k / qₙ₁ k) < err_exp qₙ₁ qₙ₂ r + ε }
def UB (ε : ℝ) = { k | log(qₙ k / qₙ₂ k) < r + ε }
-- the ε comes from the achieves_err_exp hypothesis
lemma U₁_subset_U : U₁(ε) ⊂ U(err_exp qₙ₁ qₙ₂ r - r) :=
begin
-- k ∈ U₁ ↔ (qₙ₁ k) * exp(r) ≥ (qₙ₂ k) * exp(err_exp qₙ₁ qₙ₂ r) > exp(-ε) * (qₙ k)
-- k ∈ U ↔ (qₙ₁ k) * exp(r) ≥ (qₙ₂ k) * exp(err_exp qₙ₁ qₙ₂ r)
end
lemma U₂_subset_Uᶜ : U₂(ε) ⊂ Uᶜ(err_exp qₙ₁ qₙ₂ r - r) :=
begin
-- k ∈ U₂ ↔ (qₙ₂ k) * exp(err_exp qₙ₁ qₙ₂ r - r) ≥ (qₙ₁ k) > (qₙ k) * exp(-(ε + r))
-- k ∈ Uᶜ ↔ (qₙ₂ k) * exp(err_exp qₙ₁ qₙ₂ r - r) > (qₙ₁ k)
end
lemma in_U₁_qₙ {k} :
k ∈ U₁(ε) → (qₙ₂ k) * exp(err_exp qₙ₁ qₙ₂ r) > exp(-ε) * (qₙ k) := by rw U₁
/-- page 409 -/
lemma prob_error_α_ge_prob_in_U₁ :
α qₙ₁ qₙ₂ T ≥ exp(-(ε + err_exp qₙ₁ qₙ₂ r)) * ∑ k ∈ U₁(ε), (qₙ k) :=
begin
calc α qₙ₁ qₙ₂ T = ∑ k ∈ U T, (qₙ₂ k) : by rw α,
... ≥ ∑ k ∈ U₁(ε), (qₙ₂ k) : by simp only [U₁_subset_U]
... ≥ ∑ k ∈ U₁(ε), (qₙ k) * exp(-(ε + err_exp qₙ₁ qₙ₂ r)) : by simp only [in_U₁_qₙ]
... = exp(-(ε + err_exp qₙ₁ qₙ₂ r)) * ∑ k ∈ U₁(ε), (qₙ k) : by simp,
end
/-- page 409 -/
lemma prob_error_β_ge_prob_in_U₂ :
β qₙ₁ qₙ₂ T ≥ exp(-(ε + r)) * ∑ k ∈ U₂(ε), (qₙ k) := sorry -- U₂_subset_Uᶜ
/-- page 409 -/
lemma U₁unionU₂_eq_UAinterUB :
U₁(ε) ∪ U₂(ε) = UA ∩ UB :=
begin
-- k ∈ U₁ ↔ (qₙ₁ k) * exp(r) ≥ (qₙ₂ k) * exp(err_exp qₙ₁ qₙ₂ r) > exp(-ε) * (qₙ k)
-- k ∈ U₂ ↔ (qₙ₂ k) * exp(err_exp qₙ₁ qₙ₂ r - r) ≥ (qₙ₁ k) > (qₙ k) * exp(-(ε + r))
-- k ∈ UA ↔ (qₙ₁ k) > (qₙ k) * exp(-(err_exp qₙ₁ qₙ₂ r + ε))
-- k ∈ UB ↔ (qₙ₂ k) > (qₙ k) * exp(-(r + ε))
end
lemma prob_in_UAinterUB_ge :
(∑ k ∈ UA ∩ UB, q k) ≥ 1 - (∑ k ∉ UA, q k) - (∑ k ∉ UB, q k) := sorry
/-- page 409 -/
def Uτ := { k | abs(log(qₙ k / qₙ₁ k * exp(-r))) ≥ ε }
/-- page 409 -/
lemma Uτ_rw :
Uτ = { k | abs(log(qₙ k / qₙ₁ k) - ∑ k, (qₙ k) * log(qₙ k / qₙ₁ k)) ≥ ε } := sorry
/-- if not in UA then in Uτ -/
lemma UAᶜ_subset_Uτ : UAᶜ ⊂ Uτ := sorry
variables [δ > 0] [rnd_var 1 q] [achieves_err_exp q δ]
variables (σ₁² σ₂² : ℝ)
[σ₁² = ∑ k, (q k) * log(q k / q₁ k)^2 - (∑ k, (q k) * log(q k / q₁ k))^2]
[σ₂² = ∑ k, (q k) * log(q k / q₂ k)^2 - (∑ k, (q k) * log(q k / q₂ k))^2]
example : ∑ k ∉ UA, (qₙ k) = 1 - ∑ k ∈ UA, (qₙ k) :=
by simp only [rnd_var.sum_probs_one]
lemma prob_not_in_UA_le : ∑ k ∉ UA, (qₙ k) ≤ σ₁²/ε^2 :=
begin
calc ∑ k ∉ UA, (qₙ k) ≤ ∑ k ∈ Uτ, (qₙ k) : by rw UAᶜ_subset_Uτ
... ≤ σ₁²/ε^2 : by sorry,
end
lemma prob_in_UA_ge :
1 - ∑ k ∈ UA, (qₙ k) ≤ σ₁²/ε^2 :=
by simp only [prob_not_in_UA_le]
lemma prob_not_in_UB_le :
∑ k ∉ UB, (qₙ k) ≤ σ₂²/ε^2 := sorry
lemma prob_not_in_UB_le_ :
1 - ∑ k ∈ UB, (qₙ k) ≤ σ₂²/ε^2 :=
by simp only [prob_not_in_UB_le]
lemma tmp :
∑ k ∈ UA, (qₙ k) + ∑ k ∈ UB, (qₙ k) - 1 ≥ 1 - σ₁²/ε^2 - σ₂²/ε^2 := sorry
lemma union_rw :
∑ k ∈ UA ∪ UB, (qₙ k) = ∑ k ∈ UA, (qₙ k) + ∑ k ∈ UB, (qₙ k) := sorry
/-- page 409 top right -/
lemma prob_in_UA_and_UB_ge :
∑ k ∈ UA ∩ UB, (qₙ k) ≥ 1 - (σ₁² + σ₂²)/ε^2 := sorry
lemma prob_in_U₁_and_U₂_ge :
∑ k ∈ U₁ ∪ U₂, (qₙ k) ≥ 1 - (σ₁² + σ₂²)/ε^2 :=
by simp only [U₁unionU₂_eq_UAinterUB, prob_in_UA_and_UB_ge]
lemma prob_in_U₁_and_U₂_ge_rw :
∑ k ∈ U₁, (qₙ k) + ∑ k ∈ U₂, (qₙ k) ≥ 1 - (σ₁² + σ₂²)/ε^2 :=
by simp only [prob_in_UA_and_UB_ge]
lemma prob_in_U₂_ge_of_prob_in_U₁_le :
∑ k ∈ U₁(ε), (qₙ k) ≤ γ
→ ∑ k ∈ U₂(ε), (qₙ k) ≥ 1 - (σ₁² + σ₂²)/ε^2 - γ :=
begin
intro H,
calc ∑ k ∈ U₂, (qₙ k) ≥ 1 - (σ₁² + σ₂²)/ε^2 - ∑ k ∈ U₁, (qₙ k) : by simp only [prob_in_U₁_and_U₂_ge_rw]
... ≥ 1 - (σ₁² + σ₂²)/ε^2 - γ : by simp only [H],
end
/-- Corollary 2 in Blahut1974: prob of error at least as bad as ... -/
theorem prob_of_α_error_ge {r > 0} [iid qₙ₁ q₁] [iid qₙ₂ q₂] :
∀ ε > 0, ∀ γ > 0,
β qₙ₁ qₙ₂ T ≤ γ * exp(-n*(r + ε))
→ α qₙ₁ qₙ₂ T ≥ exp(-n*(err_exp q₁ q₂ r + ε)) * (1 - (σ₁² + σ₂²)/(n*ε^2) - γ) :=
begin
intros ε hε γ hγ hβ,
-- prob_error_β_ge_prob_in_U₂ : β qₙ₁ qₙ₂ T ≥ exp(-(ε + r)) * ∑ k ∈ U₂(ε), (qₙ k)
-- γ * exp(-n*(r + ε)) ≥ β qₙ₁ qₙ₂ (err_exp qₙ₁ qₙ₂ r)
-- γ * exp(-n*(r + ε)) ≥ exp(-n*(ε + r)) * ∑ k ∈ U₂(ε), (qₙ k)
-- γ ≥ ∑ k ∈ U₂(ε), (qₙ k)
-- apply prob_in_U₂_ge_of_prob_in_U₁_le to get ∑ k ∈ U₂(ε), (qₙ k) ≥ 1 - (σ₁² + σ₂²)/ε^2 - γ
calc α qₙ₁ qₙ₂ T ≥ exp(-(ε + err_exp qₙ₁ qₙ₂ r)) * ∑ k ∈ U₁(ε), (qₙ k) : by rw prob_error_α_ge_prob_in_U₁
... ≥ exp(-(ε + err_exp qₙ₁ qₙ₂ r)) * (...) : by sorry
end
theorem log_prob_of_α_error_ge {r > 0} [iid qₙ₁ q₁] [iid qₙ₂ q₂] :
∀ ε > 0, ∀ γ > 0,
β qₙ₁ qₙ₂ (err_exp qₙ₁ qₙ₂ r) ≤ γ * exp(-n*(r + ε))
→ 1/n * log α qₙ₁ qₙ₂ T ≥ -(err_exp q₁ q₂ r/n + ε) + log(1 - (σ₁² + σ₂²)/(n*ε^2) - γ) / n :=
by simp only [prob_of_α_error_ge]
/-- "Tightness exists when the number of measurements is large" -/
theorem liminf_log_prob_of_α_error_ge {r > 0} [iid qₙ₁ q₁] [iid qₙ₂ q₂] :
∀ ε > 0, ∀ γ > 0,
β qₙ₁ qₙ₂ (err_exp qₙ₁ qₙ₂ r) ≤ γ * exp(-n*(r + ε))
→ liminf at_top λ n, 1/n * log α qₙ₁ qₙ₂ T ≥ - discrimination q₁ q₂ - ε :=
by simp only [log_prob_of_α_error_ge]
variables (σ² : ℝ)
[σ² = ∑ k, (qₙ₂ k) * log(qₙ₂ k / qₙ₁ k)^2 - (∑ k, (qₙ₂ k) * log(qₙ₂ k / qₙ₁ k))^2]
def c := discrimination q₂ q₁
/-- Theorem 11 in Blahut1974: if r > c then "α error prob approaches 1 with n" -/
theorem α_error_worse_for_large_n_of_greedy_region {r : ℝ} [iid qₙ₁ q₁] [iid qₙ₂ q₂] :
r > c ∧ β qₙ₁ qₙ₂ T ≤ exp(-n*r)
→ α qₙ₁ qₙ₂ T > 1 - 4*σ²/(n*(r-c)^2) - exp(-n*(r-c)/2) := sorry
-
Applications:
-
If the output of the source is a Quantum state then we Distinguish quantum states;
-
choosing T corresponds to choosing a test A.
-