{ "title": "Q-Learning Mastery: 100 MCQs", "description": "A comprehensive set of 100 multiple-choice questions on Q-Learning, covering core concepts, implementation details, and practical scenarios.", "questions": [ { "id": 1, "questionText": "Q-Learning is an example of which type of reinforcement learning?", "options": [ "On-policy learning", "Off-policy learning", "Supervised learning", "Unsupervised learning" ], "correctAnswerIndex": 1, "explanation": "Q-Learning is off-policy because it learns the optimal action-value function regardless of the agent’s current policy." }, { "id": 2, "questionText": "In Q-Learning, the Q-value represents:", "options": [ "Expected cumulative reward for a state-action pair", "Discount factor", "Immediate reward only", "Next state value" ], "correctAnswerIndex": 0, "explanation": "Q(s,a) estimates the total expected future reward starting from state s and taking action a." }, { "id": 3, "questionText": "The Q-Learning update rule uses which key component?", "options": [ "Random action selection", "Current policy only", "Max Q value of next state", "Immediate reward only" ], "correctAnswerIndex": 2, "explanation": "The max operator selects the best possible future action to update the current Q-value." }, { "id": 4, "questionText": "Which equation represents the Q-Learning update?", "options": [ "V(s) ← r only", "Q(s,a) ← Q(s,a) + α[r + γ max Q(s’,a’) − Q(s,a)]", "Policy π(s) ← π(s) + α", "TD error δ = r − V(s)" ], "correctAnswerIndex": 1, "explanation": "This standard Q-Learning formula updates Q-values based on observed reward and the estimated optimal future value." }, { "id": 5, "questionText": "The learning rate α in Q-Learning controls:", "options": [ "How much new information overrides old Q-values", "Exploration probability", "Discount of future rewards", "Reward shaping" ], "correctAnswerIndex": 0, "explanation": "α determines how quickly the Q-values are updated using new information." }, { "id": 6, "questionText": "The discount factor γ in Q-Learning affects:", "options": [ "Learning rate", "Importance of future rewards", "Immediate reward only", "Exploration strategy" ], "correctAnswerIndex": 1, "explanation": "γ weights future rewards relative to immediate rewards, controlling short-term vs long-term focus." }, { "id": 7, "questionText": "Which exploration strategy is commonly used in Q-Learning?", "options": [ "Policy gradient", "Softmax", "Random selection", "ε-greedy" ], "correctAnswerIndex": 3, "explanation": "ε-greedy balances exploration of new actions and exploitation of best-known actions." }, { "id": 8, "questionText": "Q-Learning is best suited for:", "options": [ "Continuous action spaces only", "Supervised classification", "Discrete action spaces", "Clustering problems" ], "correctAnswerIndex": 2, "explanation": "Classic Q-Learning assumes a finite set of actions for tabular updates." }, { "id": 9, "questionText": "Which component is not part of Q-Learning?", "options": [ "Policy gradient", "Action", "State", "Reward" ], "correctAnswerIndex": 0, "explanation": "Q-Learning does not directly use policy gradients; it learns optimal Q-values." 
}, { "id": 10, "questionText": "Q-Learning converges to the optimal Q-values if:", "options": [ "The agent explores randomly only once", "All state-action pairs are visited infinitely often and learning rate decays appropriately", "Immediate reward is always positive", "Discount factor is zero" ], "correctAnswerIndex": 1, "explanation": "Convergence requires sufficient exploration and proper decay of the learning rate." }, { "id": 11, "questionText": "The 'max' operator in Q-Learning is used to:", "options": [ "Compute immediate reward", "Adjust learning rate", "Select the best next action value for update", "Randomize Q-values" ], "correctAnswerIndex": 2, "explanation": "max_a Q(s’,a’) selects the highest estimated return from the next state." }, { "id": 12, "questionText": "Which type of reward signal does Q-Learning require?", "options": [ "Policy gradients", "Scalar rewards", "State transitions only", "Vector rewards" ], "correctAnswerIndex": 1, "explanation": "Q-Learning updates require a single scalar reward to calculate TD error." }, { "id": 13, "questionText": "In tabular Q-Learning, the Q-table stores:", "options": [ "Only action probabilities", "Q-values for all state-action pairs", "Only state values", "Immediate rewards" ], "correctAnswerIndex": 1, "explanation": "The Q-table maps every state-action combination to an estimated value." }, { "id": 14, "questionText": "If the agent follows an ε-greedy policy, it:", "options": [ "Selects the best action most of the time but explores randomly sometimes", "Updates Q-values without actions", "Always selects the action with highest Q-value", "Only explores randomly" ], "correctAnswerIndex": 0, "explanation": "ε-greedy balances exploitation and exploration for better learning." }, { "id": 15, "questionText": "Q-Learning is considered off-policy because it:", "options": [ "Updates values based on current policy only", "Learns the optimal Q-values independently of the policy being followed", "Requires supervised labels", "Uses Monte Carlo returns exclusively" ], "correctAnswerIndex": 1, "explanation": "Off-policy learning allows using exploratory policy while learning optimal Q-values." }, { "id": 16, "questionText": "Which scenario is suitable for Q-Learning?", "options": [ "Gridworld navigation with discrete actions", "Principal component analysis", "Continuous robot control without discretization", "Unsupervised clustering" ], "correctAnswerIndex": 0, "explanation": "Tabular Q-Learning works best in environments with discrete actions and states." }, { "id": 17, "questionText": "Which condition may slow Q-Learning convergence?", "options": [ "Low exploration and high learning rate", "Decaying learning rate", "Proper exploration and small learning rate", "Infinite state-action visits" ], "correctAnswerIndex": 0, "explanation": "Insufficient exploration or unstable learning rates can slow convergence." }, { "id": 18, "questionText": "In Q-Learning, what happens if α = 1?", "options": [ "Discount factor becomes 0", "Agent ignores rewards", "Q-values are updated only based on latest observation, ignoring old values", "Learning rate is too slow" ], "correctAnswerIndex": 2, "explanation": "Setting α=1 completely replaces old Q-values with new estimates." }, { "id": 19, "questionText": "The TD error in Q-Learning is:", "options": [ "Immediate reward minus zero", "V(s) − r", "δ = r + γ max Q(s’,a’) − Q(s,a)", "Policy gradient only" ], "correctAnswerIndex": 2, "explanation": "TD error measures difference between predicted and target Q-values." 
}, { "id": 20, "questionText": "Which of the following is true about discount factor γ?", "options": [ "γ < 0", "0 ≤ γ ≤ 1, controlling future reward importance", "γ > 1", "γ irrelevant for Q-Learning" ], "correctAnswerIndex": 1, "explanation": "γ controls how much future rewards are considered in Q-value updates." }, { "id": 21, "questionText": "What happens if γ = 0 in Q-Learning?", "options": [ "Exploration rate increases", "Learning stops", "Agent values long-term rewards equally", "Agent only considers immediate rewards" ], "correctAnswerIndex": 3, "explanation": "Zero discount factor ignores future rewards, making agent myopic." }, { "id": 22, "questionText": "Which is an advantage of Q-Learning?", "options": [ "Works only for small state spaces", "Requires supervised labels", "Cannot handle stochastic rewards", "Can learn optimal policy without following it" ], "correctAnswerIndex": 3, "explanation": "Off-policy learning allows Q-Learning to learn optimal values even with exploratory actions." }, { "id": 23, "questionText": "What is the main limitation of tabular Q-Learning?", "options": [ "Cannot learn from rewards", "Cannot use TD error", "Cannot handle discrete actions", "Does not scale to large or continuous state spaces" ], "correctAnswerIndex": 3, "explanation": "Tabular storage becomes infeasible for large or continuous environments." }, { "id": 24, "questionText": "Which type of learning does Q-Learning rely on?", "options": [ "Supervised learning", "Reinforcement learning", "Self-supervised learning", "Unsupervised learning" ], "correctAnswerIndex": 1, "explanation": "Q-Learning is a reinforcement learning algorithm using rewards to learn optimal actions." }, { "id": 25, "questionText": "Which is an essential component for Q-Learning?", "options": [ "Reward signal", "Feature scaling only", "Loss function gradient", "Cluster labels" ], "correctAnswerIndex": 0, "explanation": "Q-Learning requires a scalar reward to update Q-values." }, { "id": 26, "questionText": "Which aspect differentiates SARSA from Q-Learning?", "options": [ "SARSA cannot learn", "Q-Learning uses supervised labels", "SARSA is on-policy, Q-Learning is off-policy", "SARSA ignores rewards" ], "correctAnswerIndex": 2, "explanation": "SARSA updates Q-values using the action actually taken (on-policy)." }, { "id": 27, "questionText": "Which operator ensures Q-Learning selects best next action in value update?", "options": [ "Average operator", "Min operator", "Random operator", "Max operator over next Q-values" ], "correctAnswerIndex": 3, "explanation": "max_a Q(s’,a’) chooses the highest estimated return for next state." }, { "id": 28, "questionText": "Q-Learning can handle stochastic environments because:", "options": [ "It ignores randomness", "It uses expected rewards over time", "It requires deterministic transitions", "It uses supervised labels" ], "correctAnswerIndex": 1, "explanation": "Q-values converge to expected returns even when rewards or transitions are probabilistic." }, { "id": 29, "questionText": "Which strategy balances exploration and exploitation in Q-Learning?", "options": [ "Random selection only", "Pure greedy policy", "Policy gradient", "ε-greedy policy" ], "correctAnswerIndex": 3, "explanation": "ε-greedy allows occasional random actions to explore while usually exploiting best-known actions." 
}, { "id": 30, "questionText": "What does convergence of Q-Learning mean?", "options": [ "Learning rate increases infinitely", "Agent stops moving", "Rewards become zero", "Q-values approximate optimal values for all state-action pairs" ], "correctAnswerIndex": 3, "explanation": "Convergence means the Q-table represents optimal expected returns, and the agent can act optimally." }, { "id": 31, "questionText": "If an agent in a gridworld uses Q-Learning with γ=0.9, what does this imply?", "options": [ "Only immediate reward matters", "Agent acts randomly", "Future rewards are important but slightly discounted", "Future rewards are ignored" ], "correctAnswerIndex": 2, "explanation": "A discount factor of 0.9 prioritizes long-term rewards while still considering immediate rewards." }, { "id": 32, "questionText": "Which condition can cause Q-Learning to fail to converge?", "options": [ "Low learning rate", "Sparse rewards", "Insufficient exploration of state-action space", "High discount factor" ], "correctAnswerIndex": 2, "explanation": "If some state-action pairs are never visited, Q-values for those pairs cannot converge." }, { "id": 33, "questionText": "What is the purpose of decaying ε in ε-greedy policy?", "options": [ "Increase randomness constantly", "Reduce exploration over time to favor exploitation", "Ignore exploration", "Stabilize learning rate" ], "correctAnswerIndex": 1, "explanation": "Decaying ε gradually shifts agent behavior from exploration to exploitation as it learns." }, { "id": 34, "questionText": "In a cliff-walking environment, Q-Learning might:", "options": [ "Learn to avoid the cliff using negative rewards", "Receive only positive rewards", "Ignore cliffs completely", "Always fall off" ], "correctAnswerIndex": 0, "explanation": "Negative rewards for falling off the cliff guide the agent to safe paths." }, { "id": 35, "questionText": "In Q-Learning, increasing α too high can cause:", "options": [ "Slow convergence", "Ignoring rewards", "Unstable learning and oscillating Q-values", "Reduced exploration" ], "correctAnswerIndex": 2, "explanation": "High learning rate can make Q-values change too abruptly and prevent convergence." }, { "id": 36, "questionText": "Which environment property makes Q-Learning suitable?", "options": [ "Continuous states only", "No reward signal", "Continuous actions only", "Discrete state and action space" ], "correctAnswerIndex": 3, "explanation": "Tabular Q-Learning requires discrete states and actions to store Q-values." }, { "id": 37, "questionText": "An agent receives noisy rewards. How does Q-Learning handle this?", "options": [ "Ignores all rewards", "Updates only once", "Randomly resets Q-table", "Estimates expected Q-values over multiple updates" ], "correctAnswerIndex": 3, "explanation": "Repeated updates average out noise, leading to stable Q-value estimates." }, { "id": 38, "questionText": "What does the max operator in Q-Learning introduce that SARSA does not?", "options": [ "Optimism about future rewards (off-policy)", "Exploration strategy", "Immediate reward only", "Policy gradients" ], "correctAnswerIndex": 0, "explanation": "Q-Learning considers best possible next action regardless of the policy, making it off-policy." 
}, { "id": 39, "questionText": "Which scenario requires function approximation in Q-Learning?", "options": [ "Supervised datasets", "Small discrete environments", "Large state spaces where tabular storage is impractical", "Clustering tasks" ], "correctAnswerIndex": 2, "explanation": "Function approximation (like neural networks) generalizes across states in large spaces." }, { "id": 40, "questionText": "Which is true about convergence speed in Q-Learning?", "options": [ "Faster with negative rewards only", "Faster with zero exploration", "Independent of learning rate", "Depends on learning rate, exploration, and reward structure" ], "correctAnswerIndex": 3, "explanation": "Proper tuning of α, ε, and reward design affects how quickly Q-values converge." }, { "id": 41, "questionText": "In a stochastic gridworld, Q-Learning can learn optimal actions because:", "options": [ "It ignores transitions", "It only uses immediate reward", "It estimates expected Q-values over many episodes", "It does not update Q-values" ], "correctAnswerIndex": 2, "explanation": "Averaging over multiple experiences accounts for stochasticity in transitions and rewards." }, { "id": 42, "questionText": "What is the difference between Q-Learning and SARSA in terms of risk?", "options": [ "Q-Learning may be more optimistic, SARSA is more conservative", "SARSA ignores rewards", "Q-Learning is on-policy", "SARSA ignores exploration" ], "correctAnswerIndex": 0, "explanation": "Q-Learning assumes optimal next action; SARSA updates based on actual next action, making it safer in risky environments." }, { "id": 43, "questionText": "Which combination of parameters can stabilize Q-Learning in noisy environments?", "options": [ "High learning rate and zero exploration", "Ignore rewards", "Moderate learning rate and sufficient exploration", "Low discount factor and random policy" ], "correctAnswerIndex": 2, "explanation": "Moderate α and proper exploration reduce oscillations in Q-values." }, { "id": 44, "questionText": "When using Q-Learning with γ close to 1 in long-horizon tasks, the agent:", "options": [ "Ignores future rewards", "Only explores randomly", "Focuses on long-term rewards", "Receives unstable rewards" ], "correctAnswerIndex": 2, "explanation": "High discount factor prioritizes cumulative rewards far into the future." }, { "id": 45, "questionText": "In Q-Learning, why is it necessary to visit all state-action pairs?", "options": [ "To update only visited states", "To ensure convergence to true optimal Q-values", "To decrease learning rate", "To ignore reward signals" ], "correctAnswerIndex": 1, "explanation": "Without exploring all state-action pairs, Q-values for some states may never converge." }, { "id": 46, "questionText": "Which technique can help Q-Learning in large state spaces?", "options": [ "Random action selection only", "Tabular Q-values only", "Function approximation with neural networks", "Ignore exploration" ], "correctAnswerIndex": 2, "explanation": "Approximation allows generalization to unseen states and reduces memory requirements." }, { "id": 47, "questionText": "In Q-Learning, what is the impact of too small α?", "options": [ "Oscillating Q-values", "Ignoring future rewards", "Faster convergence", "Slow learning" ], "correctAnswerIndex": 3, "explanation": "Small learning rate updates Q-values slowly, making learning take longer." 
}, { "id": 48, "questionText": "Which scenario demonstrates reward shaping in Q-Learning?", "options": [ "Reward only at episode end", "Providing intermediate positive rewards for partial progress", "Ignore reward signal", "Random rewards" ], "correctAnswerIndex": 1, "explanation": "Shaping rewards guide the agent step-by-step, improving convergence speed." }, { "id": 49, "questionText": "Which factor determines how much Q-Learning values immediate vs future reward?", "options": [ "Reward scaling only", "Discount factor γ", "Learning rate α", "Exploration ε" ], "correctAnswerIndex": 1, "explanation": "γ weighs future reward relative to immediate reward." }, { "id": 50, "questionText": "An agent overestimates Q-values due to stochastic rewards. Which can help?", "options": [ "Set γ=0", "Ignore rewards", "Use averaging or smoothing techniques", "Remove exploration" ], "correctAnswerIndex": 2, "explanation": "Averaging reduces overestimation caused by random fluctuations in reward." }, { "id": 51, "questionText": "Which advantage does Q-Learning have over Monte Carlo methods?", "options": [ "Cannot handle stochastic rewards", "Can update Q-values before episode ends (online learning)", "Needs supervised labels", "Requires full episode only" ], "correctAnswerIndex": 1, "explanation": "TD update allows learning from partial sequences without waiting for episode completion." }, { "id": 52, "questionText": "In an environment with many terminal states, Q-Learning:", "options": [ "Requires supervised signal", "Still updates Q-values until convergence", "Fails to update", "Ignores terminal states" ], "correctAnswerIndex": 1, "explanation": "Terminal states are treated as having zero future reward, allowing updates to proceed." }, { "id": 53, "questionText": "In Q-Learning, which factor controls optimism in action selection?", "options": [ "Discount factor γ only", "max operator over next Q-values", "Learning rate α only", "Exploration ε only" ], "correctAnswerIndex": 1, "explanation": "The max operator assumes the agent can always take the best next action, leading to optimistic estimates." }, { "id": 54, "questionText": "Which property makes Q-Learning off-policy?", "options": [ "Updates Q-values only for chosen action", "Updates Q-values using best possible action, not necessarily the action taken", "Requires reward shaping", "Uses Monte Carlo returns only" ], "correctAnswerIndex": 1, "explanation": "Off-policy means learning optimal Q-values independently of current behavior policy." }, { "id": 55, "questionText": "When using Q-Learning in a real robot, what challenge arises?", "options": [ "Rewards cannot be negative", "Tabular Q-table is sufficient", "Immediate rewards only", "Large state-action space requiring function approximation" ], "correctAnswerIndex": 3, "explanation": "Physical robots often have continuous states, necessitating approximation instead of tabular Q-tables." }, { "id": 56, "questionText": "Which technique reduces overestimation bias in Q-Learning?", "options": [ "Increase γ to 1", "Double Q-Learning", "Ignore exploration", "TD(0) only" ], "correctAnswerIndex": 1, "explanation": "Double Q-Learning uses two value estimators to prevent overestimating max Q-values." 
}, { "id": 57, "questionText": "Which factor helps avoid Q-Learning getting stuck in suboptimal policies?", "options": [ "Only immediate reward", "Sufficient exploration (ε-greedy)", "Ignoring future rewards", "Zero learning rate" ], "correctAnswerIndex": 1, "explanation": "Exploration ensures the agent discovers better actions over time." }, { "id": 58, "questionText": "In Q-Learning, high variance in reward signals can be addressed by:", "options": [ "Reducing discount factor to zero", "Averaging over multiple updates or smoothing Q-values", "Random exploration only", "Ignoring reward signals" ], "correctAnswerIndex": 1, "explanation": "Smoothing reduces fluctuations from noisy reward signals, stabilizing learning." }, { "id": 59, "questionText": "Which approach is needed when state space is continuous in Q-Learning?", "options": [ "Function approximation using neural networks or tile coding", "Tabular Q-learning", "Immediate reward update only", "Random exploration only" ], "correctAnswerIndex": 0, "explanation": "Continuous states cannot be stored in tables, so approximation methods are used." }, { "id": 60, "questionText": "If an agent converges slowly, which adjustment can help?", "options": [ "Ignore rewards", "Increase exploration and adjust learning rate", "Reduce discount factor to 0", "Remove ε-greedy strategy" ], "correctAnswerIndex": 1, "explanation": "Proper tuning of exploration and learning rate accelerates convergence." }, { "id": 61, "questionText": "Which scenario illustrates reward hacking in Q-Learning?", "options": [ "Sparse rewards guide learning properly", "All Q-values remain zero", "Agent follows intended task exactly", "Agent exploits unintended behavior to get high reward without completing task" ], "correctAnswerIndex": 3, "explanation": "Reward hacking happens when the agent finds loopholes in the reward function." }, { "id": 62, "questionText": "In episodic tasks, Q-Learning updates:", "options": [ "Only at episode end", "Randomly without rewards", "Only for terminal state", "After every step using TD update" ], "correctAnswerIndex": 3, "explanation": "Q-values are updated after each step using temporal-difference (TD) update." }, { "id": 63, "questionText": "Which combination stabilizes learning in stochastic, large environments?", "options": [ "Zero learning rate", "Tabular Q-learning only", "Ignore reward signal", "Function approximation + proper α + sufficient exploration" ], "correctAnswerIndex": 3, "explanation": "Function approximation generalizes across states and proper tuning ensures stable learning." }, { "id": 64, "questionText": "Which action selection method ensures exploration in Q-Learning?", "options": [ "Always greedy", "ε-greedy policy", "Random without Q-values", "Policy gradient only" ], "correctAnswerIndex": 1, "explanation": "ε-greedy policy randomly explores some actions to prevent getting stuck in suboptimal paths." }, { "id": 65, "questionText": "High γ and delayed reward may cause:", "options": [ "Faster random exploration", "Ignoring rewards", "Slow learning and dependency on accurate Q-values", "Immediate convergence" ], "correctAnswerIndex": 2, "explanation": "High γ emphasizes long-term reward, which requires careful learning over many steps." 
}, { "id": 66, "questionText": "In Q-Learning, which factor prevents overestimation bias?", "options": [ "No exploration", "Double Q-Learning", "Single Q-table always", "Zero discount factor" ], "correctAnswerIndex": 1, "explanation": "Using two Q-tables reduces the risk of overestimating action values." }, { "id": 67, "questionText": "What is the role of temporal-difference (TD) in Q-Learning?", "options": [ "Random Q-value assignment", "Requires full episode", "Updates Q-values incrementally using observed rewards and estimated future Q-values", "Ignores rewards" ], "correctAnswerIndex": 2, "explanation": "TD allows learning online without waiting for episode completion." }, { "id": 68, "questionText": "Which environment is challenging for tabular Q-Learning?", "options": [ "Deterministic rewards", "High-dimensional continuous state spaces", "Single-step tasks", "Small discrete grids" ], "correctAnswerIndex": 1, "explanation": "Large continuous spaces cannot store Q-values in tables; function approximation is required." }, { "id": 69, "questionText": "What does convergence in Q-Learning indicate?", "options": [ "Agent stops learning", "Rewards are zero", "Discount factor is ignored", "Q-values have stabilized near optimal values for all state-action pairs" ], "correctAnswerIndex": 3, "explanation": "Convergence means Q-values reflect true expected returns, allowing optimal policy selection." }, { "id": 70, "questionText": "Which is a limitation of standard Q-Learning?", "options": [ "Does not scale to large or continuous spaces without function approximation", "Cannot handle stochastic rewards", "Cannot learn discrete actions", "Requires supervised labels" ], "correctAnswerIndex": 0, "explanation": "Tabular Q-Learning is impractical for large or continuous environments; approximation methods are needed." }, { "id": 71, "questionText": "An autonomous car uses Q-Learning to navigate traffic. If it repeatedly chooses a risky shortcut with occasional high reward but often crashes, how should the agent be adjusted?", "options": [ "Reduce learning rate to zero", "Increase penalty for crashes and adjust ε-greedy to explore safer routes", "Ignore crashes and maximize reward", "Decrease discount factor to zero" ], "correctAnswerIndex": 1, "explanation": "Penalizing crashes guides the agent to safer policies, and proper exploration ensures it finds optimal routes." }, { "id": 72, "questionText": "A robot learns to pick objects using Q-Learning with sparse rewards only at completion. Learning is very slow. Which technique can help?", "options": [ "Ignore rewards", "Decrease learning rate to zero", "Remove discount factor", "Reward shaping: provide intermediate rewards for partial progress" ], "correctAnswerIndex": 3, "explanation": "Reward shaping accelerates learning by giving feedback during intermediate steps." }, { "id": 73, "questionText": "In a stochastic gridworld, an agent sometimes receives higher rewards for wrong actions due to randomness. Which method reduces overestimation?", "options": [ "Double Q-Learning with two value estimators", "Increase discount factor to 1", "Reduce learning rate to zero", "Single Q-table only" ], "correctAnswerIndex": 0, "explanation": "Double Q-Learning mitigates overestimation by using two independent Q-value estimates." }, { "id": 74, "questionText": "A Q-Learning agent is stuck in a local optimum due to greedy action selection. 
What can improve exploration?", "options": [ "Use single greedy action forever", "Set learning rate to zero", "Increase ε in ε-greedy or use decaying exploration", "Ignore rewards" ], "correctAnswerIndex": 2, "explanation": "Higher exploration probability allows discovering better actions outside local optimum." }, { "id": 75, "questionText": "In a multi-agent Q-Learning environment, agents’ actions influence each other. Which is a challenge?", "options": [ "Single agent updates suffice", "Discount factor becomes negative", "Rewards are ignored", "Non-stationary environment due to other agents learning simultaneously" ], "correctAnswerIndex": 3, "explanation": "Other agents’ learning changes environment dynamics, making convergence harder." }, { "id": 76, "questionText": "An agent using Q-Learning in a continuous state environment shows poor performance. Which approach is appropriate?", "options": [ "Increase tabular Q-table size indefinitely", "Ignore exploration", "Set learning rate to zero", "Use function approximation (neural networks) to estimate Q-values" ], "correctAnswerIndex": 3, "explanation": "Continuous states cannot be stored in tables; function approximation allows generalization." }, { "id": 77, "questionText": "During Q-Learning in a complex maze, the agent repeatedly oscillates between two states. Which adjustment may help?", "options": [ "Remove discount factor", "Increase ε to 1 permanently", "Decrease learning rate α slightly or adjust reward structure", "Ignore oscillations" ], "correctAnswerIndex": 2, "explanation": "Oscillations indicate learning instability; careful tuning of α and reward shaping stabilizes learning." }, { "id": 78, "questionText": "In Q-Learning for a trading bot, the agent overestimates future profits due to stochastic market fluctuations. Which strategy helps?", "options": [ "Ignore stochasticity", "Double Q-Learning to reduce overestimation bias", "Single Q-table with γ=1", "Remove exploration" ], "correctAnswerIndex": 1, "explanation": "Double Q-Learning separates action selection and evaluation to avoid bias from stochastic rewards." }, { "id": 79, "questionText": "A drone uses Q-Learning to navigate and avoid obstacles. It receives frequent collisions initially. How can learning be stabilized?", "options": [ "Assign strong negative rewards for collisions and gradually decay ε", "Ignore collisions", "Increase learning rate to 1", "Reduce discount factor to zero" ], "correctAnswerIndex": 0, "explanation": "Penalizing unsafe actions combined with proper exploration encourages safer policies." }, { "id": 80, "questionText": "An agent in Q-Learning receives inconsistent sensor readings affecting rewards. Which technique improves stability?", "options": [ "Smooth Q-value updates using averaging or low learning rate", "Use single-step greedy policy", "Ignore rewards", "Set discount factor to zero" ], "correctAnswerIndex": 0, "explanation": "Smoothing or smaller α reduces fluctuations caused by noisy feedback." }, { "id": 81, "questionText": "In a delivery robot scenario, the agent reaches destination quickly but takes unsafe shortcuts. What should be modified in Q-Learning?", "options": [ "Set γ=0", "Reduce learning rate to zero", "Increase penalty for unsafe actions and adjust reward structure", "Ignore unsafe actions" ], "correctAnswerIndex": 2, "explanation": "Adjusting penalties and rewards guides agent to safer optimal paths." }, { "id": 82, "questionText": "A Q-Learning agent in a video game receives sparse rewards only when completing levels. 
How can learning speed be improved?", "options": [ "Reduce discount factor", "Set learning rate to zero", "Ignore exploration", "Introduce intermediate rewards for milestones" ], "correctAnswerIndex": 3, "explanation": "Reward shaping provides more frequent feedback, improving learning speed." }, { "id": 83, "questionText": "In a dynamic environment with moving obstacles, why might Q-Learning converge slowly?", "options": [ "Rewards are always positive", "Learning rate too small", "Discount factor is zero", "Non-stationary environment causes changing optimal Q-values" ], "correctAnswerIndex": 3, "explanation": "Changing environment requires continual adaptation, slowing convergence." }, { "id": 84, "questionText": "A warehouse robot using Q-Learning keeps picking inefficient paths. How can policy improvement be encouraged?", "options": [ "Adjust reward for efficiency and continue exploration with ε-greedy", "Set discount factor to zero", "Ignore inefficiency", "Reduce learning rate to zero" ], "correctAnswerIndex": 0, "explanation": "Reward shaping and proper exploration help agent discover optimal, efficient paths." }, { "id": 85, "questionText": "In a financial portfolio scenario, the agent overestimates risk-adjusted return. Which technique reduces bias?", "options": [ "Double Q-Learning with separate estimators", "Set discount factor to 1", "Ignore stochasticity", "Use single-step greedy policy" ], "correctAnswerIndex": 0, "explanation": "Double Q-Learning mitigates overestimation in stochastic rewards or returns." }, { "id": 86, "questionText": "An agent navigates a multi-floor building with Q-Learning. Learning is slow due to state explosion. Which solution is best?", "options": [ "Increase tabular Q-table indefinitely", "Ignore exploration", "Reduce discount factor to zero", "Use function approximation or state aggregation" ], "correctAnswerIndex": 3, "explanation": "Function approximation allows handling large state spaces without storing all combinations." }, { "id": 87, "questionText": "In a stochastic taxi environment, the agent receives negative rewards occasionally due to random delays. How can learning remain stable?", "options": [ "Ignore negative rewards", "Use greedy policy only", "Set discount factor to zero", "Smooth Q-value updates with moderate α and sufficient exploration" ], "correctAnswerIndex": 3, "explanation": "Smoothing reduces the effect of random negative rewards on Q-values." }, { "id": 88, "questionText": "A Q-Learning agent repeatedly chooses high reward but risky actions in a factory. How to improve policy?", "options": [ "Ignore risky actions", "Reduce learning rate to zero", "Adjust reward function to penalize risk and encourage safe behavior", "Use discount factor zero" ], "correctAnswerIndex": 2, "explanation": "Incorporating risk penalties ensures agent balances reward and safety." }, { "id": 89, "questionText": "During Q-Learning, the agent oscillates between states with similar Q-values. Which adjustment stabilizes learning?", "options": [ "Remove exploration", "Reduce learning rate α or adjust rewards to break ties", "Increase discount factor γ to 1", "Ignore oscillations" ], "correctAnswerIndex": 1, "explanation": "Slower updates and differentiating rewards stabilize Q-value updates." }, { "id": 90, "questionText": "An agent receives inconsistent sensor readings affecting rewards. 
Which technique improves Q-Learning stability?", "options": [ "Use greedy policy only", "Set discount factor to zero", "Ignore rewards", "Use averaging of Q-value updates or low α" ], "correctAnswerIndex": 3, "explanation": "Averaging and lower learning rate reduce fluctuations caused by noisy rewards." }, { "id": 91, "questionText": "In a multi-agent Q-Learning environment, agents’ policies keep changing, making the environment non-stationary. What is a possible solution?", "options": [ "Reduce discount factor to zero", "Use centralized training with decentralized execution or stabilize policies", "Set learning rate α to zero", "Ignore other agents" ], "correctAnswerIndex": 1, "explanation": "Centralized training helps account for other agents’ behavior while allowing independent execution." }, { "id": 92, "questionText": "A robot learns Q-values in a noisy environment. To prevent overfitting to noise, which strategy is useful?", "options": [ "Ignore exploration", "Use greedy policy only", "Reduce learning rate α and use averaging", "Set discount factor to 1" ], "correctAnswerIndex": 2, "explanation": "Lower α and averaging stabilize Q-values despite noisy feedback." }, { "id": 93, "questionText": "An agent in Q-Learning is exploring a large maze but never discovers a crucial shortcut. What adjustment can help?", "options": [ "Remove reward", "Set learning rate α to zero", "Reduce discount factor", "Increase exploration rate ε temporarily" ], "correctAnswerIndex": 3, "explanation": "Higher exploration increases chances of discovering rare but optimal paths." }, { "id": 94, "questionText": "During Q-Learning in a grid with stochastic rewards, some Q-values fluctuate heavily. What can reduce this effect?", "options": [ "Ignore rewards", "Remove exploration entirely", "Use smaller α or moving average for updates", "Set γ=0" ], "correctAnswerIndex": 2, "explanation": "Smaller learning rate and averaging reduce sensitivity to random reward fluctuations." }, { "id": 95, "questionText": "In a simulated warehouse, multiple Q-Learning agents compete for resources. Which problem arises?", "options": [ "Discount factor irrelevant", "Rewards become deterministic", "Non-stationary environment due to other agents’ changing policies", "Exploration is unnecessary" ], "correctAnswerIndex": 2, "explanation": "Other agents learning simultaneously make the environment dynamic, complicating convergence." }, { "id": 96, "questionText": "A Q-Learning agent is learning a continuous control task using function approximation. What is a key risk?", "options": [ "Rewards are ignored", "Divergence of Q-values if learning rate or network is poorly tuned", "Greedy policy always converges", "Tabular Q-table overflow" ], "correctAnswerIndex": 1, "explanation": "Function approximation can destabilize learning without careful tuning of parameters." }, { "id": 97, "questionText": "An agent repeatedly exploits a high reward loop in a game but never completes the main objective. What is this an example of?", "options": [ "Reward hacking", "Exploration failure", "Optimal policy", "Random action" ], "correctAnswerIndex": 0, "explanation": "The agent finds a loophole in reward design, achieving high reward without completing the task." 
}, { "id": 98, "questionText": "In Q-Learning with a high discount factor γ≈1, what challenge may arise in long-horizon tasks?", "options": [ "Immediate convergence", "Rewards are ignored", "Slow convergence due to dependence on accurate long-term Q-values", "Exploration becomes irrelevant" ], "correctAnswerIndex": 2, "explanation": "High γ emphasizes future rewards, which requires learning over many steps and slows convergence." }, { "id": 99, "questionText": "A delivery robot using Q-Learning keeps choosing shortest paths but risks collisions. How to improve policy?", "options": [ "Ignore collisions", "Set γ=0", "Adjust reward function to penalize unsafe paths", "Set α=0" ], "correctAnswerIndex": 2, "explanation": "Reward shaping ensures agent balances efficiency with safety." }, { "id": 100, "questionText": "During Q-Learning in a noisy, stochastic environment, Q-values oscillate heavily. Which approach helps?", "options": [ "Set discount factor to zero", "Use greedy policy only", "Ignore rewards", "Reduce learning rate α, smooth updates, and ensure sufficient exploration" ], "correctAnswerIndex": 3, "explanation": "Smoothing and proper parameter tuning stabilize learning despite noisy feedback." } ] }