Distinguish probability distributions

A source that outputs symbols. With prob dist q. Makes symbol k.

Is the prob dist q₁ or q₂?

We ask the source to make a symbol. Then we guess the probability distribution.

import Random_variable

variables (qₙ₁ qₙ₂ : (ℕ → ι) → ℝ) [rnd_var n qₙ₁] [rnd_var n qₙ₂]

/-
When might we guess q₁ for a given symbol k?
When the prob of k occuring according to q₁ is large.
-/

/-- if output symbol is one of these, then guess q₁ -/
def U (T : ℝ) := { k | (qₙ₁ k) ≥ (qₙ₂ k) * exp T }

-- Note: k is a sequence of symbols. It can have len 1 or more.

-- rewrite
example : U (T : ℝ) = { k | log(qₙ₁ k / qₙ₂ k) ≥ T }

/- What if T was a function of k? A third prob dist maybe? Can we guess better? -/

Let the source be a 6 faced die.

Let q₁ := {'1': 1, '2': 0, '3': 0, '4': 0, '5': 0, '6': 0}

Let q₂ := {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0, '6': 1}
U (T : ℝ) := { k | (q₁ k) ≥ (q₂ k) * exp T }

= {1, 2, 3, 4, 5}
We roll the die and get 1

uhhh, q₁?
We roll the die and get 1

uhhh, q₁? correct!
We roll the die and get 6

q₂?
We roll the die and get 6

q₂? yes!
Let q₁ := {'1': 7/8, '2': 0, '3': 0, '4': 0, '5': 0, '6': 1/8}

Let q₂ := {'1': 1/8, '2': 0, '3': 0, '4': 0, '5': 0, '6': 7/8}
U (T : ℝ) := { k | (q₁ k) ≥ (q₂ k) * exp T }

= {1, 2, 3, 4, 5} for exp T = 1/2
the die can be q₁ and produce a symbol k ∉ U T, like 6

in this case our strategy fails. And we guess q₂ incorrectly.

what's the probability of this happening?

/--  Prob. of error: guess q₁ when actually q₂ -/
def α (T : ℝ) : ℝ := ∑ k ∈ (U T), (qₙ₂ k)

/-- Prob. of error: guess q₂ when actually q₁ -/
def β (T : ℝ) : ℝ := ∑ k ∉ (U T), (qₙ₁ k)

we want α and β as small as possible


How to choose U T?


If we reduce U T then we reduce α


If we reduce U T then we increase β

theorem trade_off (T₁ T₂ : R) :
β qₙ₁ qₙ₂ T₁ < β qₙ₁ qₙ₂ T₂
→ α qₙ₁ qₙ₂ T₁ > α qₙ₁ qₙ₂ T₂ := sorry

🤔

The Neyman-Pearson approach:


The Neyman-Pearson approach:

Assume β ≤ βₘ.
The Neyman-Pearson approach:

Assume β ≤ βₘ. Then minimize α.
import Relative_entropy

/-- How close to q₂ can we get if we stay ε-close to q₁? -/
def err_exp (ε : ℝ) :=
min { b : ℝ | ∀ qₙ, b = discrimination qₙ qₙ₂ ∧ discrimination qₙ₁ qₙ ≤ ε }

example : err_exp qₙ₁ qₙ₂ 0 = discrimination qₙ₁ qₙ₂ := sorry

lemma lim_err_exp {ε : ℝ} :
lim at_top λ n, err_exp qₙ₁ qₙ₂ ε/n = discrimination qₙ₁ qₙ₂ := by sorry

/-- can get δ-close to q₂ when ε-close to q₁ -/
def achieves_err_exp [rnd_var n qₙ] [discrimination qₙ₁ qₙ ≤ ε] (δ : ℝ) :=
discrimination qₙ qₙ₂ ≥ δ

/-- not closer to q₂ if closer to q₁ -/
lemma err_exp_nonincreasing :
ε > ε' → err_exp qₙ₁ qₙ₂ ε ≤ err_exp qₙ₁ qₙ₂ ε' := sorry

variables [iid qₙ q]

lemma err_exp_of_iid [iid qₙ₁ q₁] [iid qₙ₂ q₂] :
err_exp qₙ₁ qₙ₂ ε = n * err_exp q₁ q₂ (ε/n) :=
begin
  calc err_exp qₙ₁ qₙ₂ ε = min { discrimination qₙ qₙ₂ | discrimination qₙ₁ qₙ ≤ ε } : by rw err_exp
            ... = min { n * discrimination q q₂ | n * discrimination q₁ q ≤ ε } : by rw discrimination_additive_of_iid
            ... = min { n * discrimination q q₂ | discrimination q₁ q ≤ ε/n } : by simp
            ... = n *  min { discrimination q q₂ | discrimination q₁ q ≤ ε/n } : by simp
            ... = n * err_exp q₁ q₂ (ε/n) : by rw err_exp,
end
def ψ (s : ℝ) := log( ∑ k, (qₙ₁ k)^s * ((qₙ₁ k) * (qₙ₂ k))^(1 / (1+s)) )^(1+s)

def φ (s : ℝ) (r : ℝ) := - s * r - (ψ s)

/-
"tangent to err_exp r at some s

and otherwise below it since err_exp is convex"
-/

example : d/ds ψ(s)|0 = discrimination qₙ₁ qₙ₂ := sorry

theorem err_exp_rw : 
err_exp qₙ₁ qₙ₂ r = max { φ s r | s ≥ 0 } := sorry
variables (r : ℝ) [r > 0]
(T : ℝ) [T = err_exp qₙ₁ qₙ₂ r - r] -- consider this partition

/-- Theorem 9 in Blahut1974: larger r → smaller β -/
theorem prob_of_β_error_le : β qₙ₁ qₙ₂ T ≤ exp(-r) :=
begin
  /- Let s parametrize the point (r, err_exp qₙ₁ qₙ₂ r) -/
  calc β qₙ₁ qₙ₂ T = ∑ k ∉ U T, qₙ₁ k : by sorry
             ... ≤ ∑ k, (qₙ₁ k) * (qₙ₂ k / qₙ₁ k)^(1/(1+s)) * exp(T/(1+s))) : by sorry
             ... = exp(-r) : by sorry, 
end

/--
smaller error α if q's r-close to q₁ are farther from q₂

Theorem 9 in Blahut1974
-/
theorem prob_of_α_error_le :
α qₙ₁ qₙ₂ T ≤ exp(-err_exp qₙ₁ qₙ₂ r) :=
begin
  /- Let s parametrize the point (r, err_exp r) -/
  calc α qₙ₁ qₙ₂ T = ∑ k ∈ U T, qₙ₂ k : by sorry
             ... ≤ ∑ k, (qₙ₂ k) * (qₙ₁ k / qₙ₂ k)^(s/(1+s)) * exp(-T*s/(1+s))) : by sorry
             ... = exp(-err_exp qₙ₁ qₙ₂ r) : by sorry,
end

/-- Corollary 1 in Blahut1974 -/
theorem prob_of_α_error_le_of_iid [iid qₙ₁ q₁] [iid qₙ₂ q₂] :
α qₙ₁ qₙ₂ T ≤ exp(-n * err_exp q₁ q₂ r/n) :=
by simp only [prob_of_α_error_le, err_exp_of_iid]

/-- Corollary 1 in Blahut1974 -/
theorem prob_of_β_error_le_of_iid [iid qₙ₁ q₁] [iid qₙ₂ q₂] :
β qₙ₁ qₙ₂ T ≤ exp(-n*r) := sorry

theorem log_prob_of_α_error_le :
log α qₙ₁ qₙ₂ T ≤ -err_exp qₙ₁ qₙ₂ r :=
by simp_only [prob_of_α_error_le]

theorem log_prob_of_α_error_le_of_iid [iid qₙ₁ q₁] [iid qₙ₂ q₂] :
log α qₙ₁ qₙ₂ T ≤ -n * err_exp q₁ q₂ (r/n) :=
by simp_only [log_prob_of_α_error_le, err_exp_of_iid]

theorem limsup_prob_of_α_error_le [iid qₙ₁ q₁] [iid qₙ₂ q₂] :
limsup at_top λ n, 1/n * log α qₙ₁ qₙ₂ T ≤ - discrimination q₁ q₂ :=
by simp_only [log_prob_of_α_error_le_of_iid]
theorem lim_log_prob_of_α_error {r > 0} [iid qₙ₁ q₁] [iid qₙ₂ q₂] : 
∀ γ > 0,
lim at_top λ n, 1/n * log α qₙ₁ qₙ₂ T = - discrimination q₁ q₂ :=
by simp only [liminf_log_prob_of_α_error_ge, limsup_prob_of_α_error_le]
variable [T = r - err_exp qₙ₁ qₙ₂ r]

variables [δ > 0] 
(qₙ : (ℕ → ι) → ℝ) [rnd_var n qₙ] [achieves_err_exp qₙ δ]

lemma T_range :
T ∈ [- discrimination qₙ₁ qₙ₂, discrimination qₙ₁ qₙ₂] := sorry

def U₁ (ε : ℝ) : { k | exp(-ε) < (qₙ₂ k / qₙ k) * exp(err_exp qₙ₁ qₙ₂ r) ≤ (qₙ₁ k / qₙ k) * exp(r) }

def U₂ (ε : ℝ) = { k | exp(-ε) < (qₙ₁ k / qₙ k) * exp(r) ≤ (qₙ₂ k / qₙ k) * exp(err_exp qₙ₁ qₙ₂ r) }

def UA (ε : ℝ) = { k | log(qₙ k / qₙ₁ k) <  err_exp qₙ₁ qₙ₂ r + ε }

def UB (ε : ℝ) = { k | log(qₙ k / qₙ₂ k) <  r + ε }

-- the ε comes from the achieves_err_exp hypothesis

lemma U₁_subset_U : U₁(ε) ⊂ U(err_exp qₙ₁ qₙ₂ r - r) :=
begin
  -- k ∈ U₁ ↔ (qₙ₁ k) * exp(r) ≥ (qₙ₂ k) * exp(err_exp qₙ₁ qₙ₂ r) > exp(-ε) * (qₙ k)
  -- k ∈ U  ↔ (qₙ₁ k) * exp(r) ≥ (qₙ₂ k) * exp(err_exp qₙ₁ qₙ₂ r)
end

lemma U₂_subset_Uᶜ : U₂(ε) ⊂ Uᶜ(err_exp qₙ₁ qₙ₂ r - r) :=
begin
  -- k ∈ U₂ ↔ (qₙ₂ k) * exp(err_exp qₙ₁ qₙ₂ r - r) ≥ (qₙ₁ k) > (qₙ k) * exp(-(ε + r))
  -- k ∈ Uᶜ ↔ (qₙ₂ k) * exp(err_exp qₙ₁ qₙ₂ r - r) > (qₙ₁ k)
end
lemma in_U₁_qₙ {k} :
k ∈ U₁(ε) → (qₙ₂ k) * exp(err_exp qₙ₁ qₙ₂ r) > exp(-ε) * (qₙ k) := by rw U₁

/-- page 409 -/
lemma prob_error_α_ge_prob_in_U₁ :
α qₙ₁ qₙ₂ T ≥  exp(-(ε + err_exp qₙ₁ qₙ₂ r)) * ∑ k ∈ U₁(ε), (qₙ k) :=
begin
  calc α qₙ₁ qₙ₂ T = ∑ k ∈ U T, (qₙ₂ k) : by rw α,
             ... ≥ ∑ k ∈ U₁(ε), (qₙ₂ k) : by simp only [U₁_subset_U]
             ... ≥ ∑ k ∈ U₁(ε), (qₙ k) * exp(-(ε + err_exp qₙ₁ qₙ₂ r)) : by simp only [in_U₁_qₙ]
             ... = exp(-(ε + err_exp qₙ₁ qₙ₂ r)) * ∑ k ∈ U₁(ε), (qₙ k) : by simp,
end

/-- page 409 -/
lemma prob_error_β_ge_prob_in_U₂ :
β qₙ₁ qₙ₂ T ≥ exp(-(ε + r)) * ∑ k ∈ U₂(ε), (qₙ k) := sorry  -- U₂_subset_Uᶜ
/-- page 409 -/
lemma U₁unionU₂_eq_UAinterUB :
U₁(ε) ∪ U₂(ε) = UA ∩ UB :=
begin
  -- k ∈ U₁ ↔ (qₙ₁ k) * exp(r) ≥ (qₙ₂ k) * exp(err_exp qₙ₁ qₙ₂ r) > exp(-ε) * (qₙ k)
  -- k ∈ U₂ ↔ (qₙ₂ k) * exp(err_exp qₙ₁ qₙ₂ r - r) ≥ (qₙ₁ k) > (qₙ k) * exp(-(ε + r))
  -- k ∈ UA ↔ (qₙ₁ k) > (qₙ k) * exp(-(err_exp qₙ₁ qₙ₂ r + ε))
  -- k ∈ UB ↔ (qₙ₂ k) > (qₙ k) * exp(-(r + ε)) 
end

lemma prob_in_UAinterUB_ge : 
(∑ k ∈ UA ∩ UB, q k) ≥ 1 - (∑ k ∉ UA, q k) - (∑ k ∉ UB, q k) := sorry

/-- page 409 -/
def Uτ := { k | abs(log(qₙ k / qₙ₁ k * exp(-r))) ≥ ε }

/-- page 409 -/
lemma Uτ_rw :
Uτ = { k | abs(log(qₙ k / qₙ₁ k) - ∑ k, (qₙ k) * log(qₙ k / qₙ₁ k)) ≥ ε } := sorry

/-- if not in UA then in Uτ -/
lemma UAᶜ_subset_Uτ : UAᶜ ⊂ Uτ := sorry
variables [δ > 0] [rnd_var 1 q] [achieves_err_exp q δ]

variables (σ₁² σ₂² : ℝ) 
[σ₁² = ∑ k, (q k) * log(q k / q₁ k)^2 - (∑ k, (q k) * log(q k / q₁ k))^2]
[σ₂² = ∑ k, (q k) * log(q k / q₂ k)^2 - (∑ k, (q k) * log(q k / q₂ k))^2]

example : ∑ k ∉ UA, (qₙ k) = 1 - ∑ k ∈ UA, (qₙ k) :=
by simp only [rnd_var.sum_probs_one]

lemma prob_not_in_UA_le : ∑ k ∉ UA, (qₙ k) ≤ σ₁²/ε^2 :=
begin
  calc ∑ k ∉ UA, (qₙ k) ≤ ∑ k ∈ Uτ, (qₙ k) : by rw UAᶜ_subset_Uτ
                   ... ≤ σ₁²/ε^2 : by sorry,
end

lemma prob_in_UA_ge :
1 - ∑ k ∈ UA, (qₙ k) ≤ σ₁²/ε^2 :=
by simp only [prob_not_in_UA_le]

lemma prob_not_in_UB_le :
∑ k ∉ UB, (qₙ k) ≤ σ₂²/ε^2 := sorry

lemma prob_not_in_UB_le_ :
1 - ∑ k ∈ UB, (qₙ k) ≤ σ₂²/ε^2 :=
by simp only [prob_not_in_UB_le]

lemma tmp :
∑ k ∈ UA, (qₙ k) + ∑ k ∈ UB, (qₙ k) - 1 ≥ 1 - σ₁²/ε^2 - σ₂²/ε^2 := sorry

lemma union_rw :
∑ k ∈ UA ∪ UB, (qₙ k) = ∑ k ∈ UA, (qₙ k) + ∑ k ∈ UB, (qₙ k) := sorry

/-- page 409 top right -/
lemma prob_in_UA_and_UB_ge :
∑ k ∈ UA ∩ UB, (qₙ k) ≥ 1 - (σ₁² + σ₂²)/ε^2 := sorry

lemma prob_in_U₁_and_U₂_ge :
∑ k ∈ U₁ ∪ U₂, (qₙ k) ≥ 1 - (σ₁² + σ₂²)/ε^2 :=
by simp only [U₁unionU₂_eq_UAinterUB, prob_in_UA_and_UB_ge]

lemma prob_in_U₁_and_U₂_ge_rw :
∑ k ∈ U₁, (qₙ k) + ∑ k ∈ U₂, (qₙ k) ≥ 1 - (σ₁² + σ₂²)/ε^2 :=
by simp only [prob_in_UA_and_UB_ge]

lemma prob_in_U₂_ge_of_prob_in_U₁_le :
∑ k ∈ U₁(ε), (qₙ k) ≤ γ
→ ∑ k ∈ U₂(ε), (qₙ k) ≥ 1 - (σ₁² + σ₂²)/ε^2 - γ :=
begin
  intro H,
  calc ∑ k ∈ U₂, (qₙ k) ≥ 1 - (σ₁² + σ₂²)/ε^2 - ∑ k ∈ U₁, (qₙ k) : by simp only [prob_in_U₁_and_U₂_ge_rw]
                    ... ≥ 1 - (σ₁² + σ₂²)/ε^2 - γ : by simp only [H],
end

/-- Corollary 2 in Blahut1974: prob of error at least as bad as ... -/
theorem prob_of_α_error_ge {r > 0} [iid qₙ₁ q₁] [iid qₙ₂ q₂] : 
∀ ε > 0, ∀ γ > 0,
β qₙ₁ qₙ₂ T ≤ γ * exp(-n*(r + ε))
→ α qₙ₁ qₙ₂ T ≥ exp(-n*(err_exp q₁ q₂ r + ε)) * (1 - (σ₁² + σ₂²)/(n*ε^2) - γ) :=
begin
  intros ε hε γ hγ hβ,

  -- prob_error_β_ge_prob_in_U₂ : β qₙ₁ qₙ₂ T ≥ exp(-(ε + r)) * ∑ k ∈ U₂(ε), (qₙ k)

  -- γ * exp(-n*(r + ε)) ≥ β qₙ₁ qₙ₂ (err_exp qₙ₁ qₙ₂ r)

  -- γ * exp(-n*(r + ε)) ≥ exp(-n*(ε + r)) * ∑ k ∈ U₂(ε), (qₙ k)

  -- γ                   ≥                   ∑ k ∈ U₂(ε), (qₙ k)
  
  -- apply prob_in_U₂_ge_of_prob_in_U₁_le to get ∑ k ∈ U₂(ε), (qₙ k) ≥ 1 - (σ₁² + σ₂²)/ε^2 - γ

  calc α qₙ₁ qₙ₂ T ≥  exp(-(ε + err_exp qₙ₁ qₙ₂ r)) * ∑ k ∈ U₁(ε), (qₙ k) : by rw prob_error_α_ge_prob_in_U₁
              ... ≥  exp(-(ε + err_exp qₙ₁ qₙ₂ r)) * (...) : by sorry
end

theorem log_prob_of_α_error_ge {r > 0} [iid qₙ₁ q₁] [iid qₙ₂ q₂] : 
∀ ε > 0, ∀ γ > 0,
β qₙ₁ qₙ₂ (err_exp qₙ₁ qₙ₂ r) ≤ γ * exp(-n*(r + ε))
→ 1/n * log α qₙ₁ qₙ₂ T ≥ -(err_exp q₁ q₂ r/n + ε) + log(1 - (σ₁² + σ₂²)/(n*ε^2) - γ) / n :=
by simp only [prob_of_α_error_ge]

/-- "Tightness exists when the number of measurements is large" -/
theorem liminf_log_prob_of_α_error_ge {r > 0} [iid qₙ₁ q₁] [iid qₙ₂ q₂] : 
∀ ε > 0, ∀ γ > 0,
β qₙ₁ qₙ₂ (err_exp qₙ₁ qₙ₂ r) ≤ γ * exp(-n*(r + ε))
→ liminf at_top λ n, 1/n * log α qₙ₁ qₙ₂ T ≥ - discrimination q₁ q₂ - ε :=
by simp only [log_prob_of_α_error_ge]

variables (σ² : ℝ)
[σ² = ∑ k, (qₙ₂ k) * log(qₙ₂ k / qₙ₁ k)^2 - (∑ k, (qₙ₂ k) * log(qₙ₂ k / qₙ₁ k))^2]

def c := discrimination q₂ q₁

/-- Theorem 11 in Blahut1974: if r > c then "α error prob approaches 1 with n" -/
theorem α_error_worse_for_large_n_of_greedy_region {r : ℝ} [iid qₙ₁ q₁] [iid qₙ₂ q₂] :
r > c ∧ β qₙ₁ qₙ₂ T ≤ exp(-n*r)
→ α qₙ₁ qₙ₂ T > 1 - 4*σ²/(n*(r-c)^2) - exp(-n*(r-c)/2) := sorry


References