{ "title": "Q-Learning Mastery: 100 MCQs", "description": "A comprehensive set of 100 multiple-choice questions on Q-Learning, covering core concepts, implementation details, and practical scenarios.", "questions": [ { "id": 1, "questionText": "Q-Learning is an example of which type of reinforcement learning?", "options": [ "On-policy learning", "Off-policy learning", "Supervised learning", "Unsupervised learning" ], "correctAnswerIndex": 1, "explanation": "Q-Learning is off-policy because it learns the optimal action-value function regardless of the agent’s current policy." }, { "id": 2, "questionText": "In Q-Learning, the Q-value represents:", "options": [ "Expected cumulative reward for a state-action pair", "Discount factor", "Immediate reward only", "Next state value" ], "correctAnswerIndex": 0, "explanation": "Q(s,a) estimates the total expected future reward starting from state s and taking action a." }, { "id": 3, "questionText": "The Q-Learning update rule uses which key component?", "options": [ "Random action selection", "Current policy only", "Max Q value of next state", "Immediate reward only" ], "correctAnswerIndex": 2, "explanation": "The max operator selects the best possible future action to update the current Q-value." }, { "id": 4, "questionText": "Which equation represents the Q-Learning update?", "options": [ "V(s) ← r only", "Q(s,a) ← Q(s,a) + α[r + γ max Q(s’,a’) − Q(s,a)]", "Policy π(s) ← π(s) + α", "TD error δ = r − V(s)" ], "correctAnswerIndex": 1, "explanation": "This standard Q-Learning formula updates Q-values based on observed reward and the estimated optimal future value." }, { "id": 5, "questionText": "The learning rate α in Q-Learning controls:", "options": [ "How much new information overrides old Q-values", "Exploration probability", "Discount of future rewards", "Reward shaping" ], "correctAnswerIndex": 0, "explanation": "α determines how quickly the Q-values are updated using new information." }, { "id": 6, "questionText": "The discount factor γ in Q-Learning affects:", "options": [ "Learning rate", "Importance of future rewards", "Immediate reward only", "Exploration strategy" ], "correctAnswerIndex": 1, "explanation": "γ weights future rewards relative to immediate rewards, controlling short-term vs long-term focus." }, { "id": 7, "questionText": "Which exploration strategy is commonly used in Q-Learning?", "options": [ "Policy gradient", "Softmax", "Random selection", "ε-greedy" ], "correctAnswerIndex": 3, "explanation": "ε-greedy balances exploration of new actions and exploitation of best-known actions." }, { "id": 8, "questionText": "Q-Learning is best suited for:", "options": [ "Continuous action spaces only", "Supervised classification", "Discrete action spaces", "Clustering problems" ], "correctAnswerIndex": 2, "explanation": "Classic Q-Learning assumes a finite set of actions for tabular updates." }, { "id": 9, "questionText": "Which component is not part of Q-Learning?", "options": [ "Policy gradient", "Action", "State", "Reward" ], "correctAnswerIndex": 0, "explanation": "Q-Learning does not directly use policy gradients; it learns optimal Q-values." 
}, { "id": 10, "questionText": "Q-Learning converges to the optimal Q-values if:", "options": [ "The agent explores randomly only once", "All state-action pairs are visited infinitely often and learning rate decays appropriately", "Immediate reward is always positive", "Discount factor is zero" ], "correctAnswerIndex": 1, "explanation": "Convergence requires sufficient exploration and proper decay of the learning rate." }, { "id": 11, "questionText": "The 'max' operator in Q-Learning is used to:", "options": [ "Compute immediate reward", "Adjust learning rate", "Select the best next action value for update", "Randomize Q-values" ], "correctAnswerIndex": 2, "explanation": "max_a Q(s’,a’) selects the highest estimated return from the next state." }, { "id": 12, "questionText": "Which type of reward signal does Q-Learning require?", "options": [ "Policy gradients", "Scalar rewards", "State transitions only", "Vector rewards" ], "correctAnswerIndex": 1, "explanation": "Q-Learning updates require a single scalar reward to calculate TD error." }, { "id": 13, "questionText": "In tabular Q-Learning, the Q-table stores:", "options": [ "Only action probabilities", "Q-values for all state-action pairs", "Only state values", "Immediate rewards" ], "correctAnswerIndex": 1, "explanation": "The Q-table maps every state-action combination to an estimated value." }, { "id": 14, "questionText": "If the agent follows an ε-greedy policy, it:", "options": [ "Selects the best action most of the time but explores randomly sometimes", "Updates Q-values without actions", "Always selects the action with highest Q-value", "Only explores randomly" ], "correctAnswerIndex": 0, "explanation": "ε-greedy balances exploitation and exploration for better learning." }, { "id": 15, "questionText": "Q-Learning is considered off-policy because it:", "options": [ "Updates values based on current policy only", "Learns the optimal Q-values independently of the policy being followed", "Requires supervised labels", "Uses Monte Carlo returns exclusively" ], "correctAnswerIndex": 1, "explanation": "Off-policy learning allows using exploratory policy while learning optimal Q-values." }, { "id": 16, "questionText": "Which scenario is suitable for Q-Learning?", "options": [ "Gridworld navigation with discrete actions", "Principal component analysis", "Continuous robot control without discretization", "Unsupervised clustering" ], "correctAnswerIndex": 0, "explanation": "Tabular Q-Learning works best in environments with discrete actions and states." }, { "id": 17, "questionText": "Which condition may slow Q-Learning convergence?", "options": [ "Low exploration and high learning rate", "Decaying learning rate", "Proper exploration and small learning rate", "Infinite state-action visits" ], "correctAnswerIndex": 0, "explanation": "Insufficient exploration or unstable learning rates can slow convergence." }, { "id": 18, "questionText": "In Q-Learning, what happens if α = 1?", "options": [ "Discount factor becomes 0", "Agent ignores rewards", "Q-values are updated only based on latest observation, ignoring old values", "Learning rate is too slow" ], "correctAnswerIndex": 2, "explanation": "Setting α=1 completely replaces old Q-values with new estimates." }, { "id": 19, "questionText": "The TD error in Q-Learning is:", "options": [ "Immediate reward minus zero", "V(s) − r", "δ = r + γ max Q(s’,a’) − Q(s,a)", "Policy gradient only" ], "correctAnswerIndex": 2, "explanation": "TD error measures difference between predicted and target Q-values." 
}, { "id": 20, "questionText": "Which of the following is true about discount factor γ?", "options": [ "γ < 0", "0 ≤ γ ≤ 1, controlling future reward importance", "γ > 1", "γ irrelevant for Q-Learning" ], "correctAnswerIndex": 1, "explanation": "γ controls how much future rewards are considered in Q-value updates." }, { "id": 21, "questionText": "What happens if γ = 0 in Q-Learning?", "options": [ "Exploration rate increases", "Learning stops", "Agent values long-term rewards equally", "Agent only considers immediate rewards" ], "correctAnswerIndex": 3, "explanation": "Zero discount factor ignores future rewards, making agent myopic." }, { "id": 22, "questionText": "Which is an advantage of Q-Learning?", "options": [ "Works only for small state spaces", "Requires supervised labels", "Cannot handle stochastic rewards", "Can learn optimal policy without following it" ], "correctAnswerIndex": 3, "explanation": "Off-policy learning allows Q-Learning to learn optimal values even with exploratory actions." }, { "id": 23, "questionText": "What is the main limitation of tabular Q-Learning?", "options": [ "Cannot learn from rewards", "Cannot use TD error", "Cannot handle discrete actions", "Does not scale to large or continuous state spaces" ], "correctAnswerIndex": 3, "explanation": "Tabular storage becomes infeasible for large or continuous environments." }, { "id": 24, "questionText": "Which type of learning does Q-Learning rely on?", "options": [ "Supervised learning", "Reinforcement learning", "Self-supervised learning", "Unsupervised learning" ], "correctAnswerIndex": 1, "explanation": "Q-Learning is a reinforcement learning algorithm using rewards to learn optimal actions." }, { "id": 25, "questionText": "Which is an essential component for Q-Learning?", "options": [ "Reward signal", "Feature scaling only", "Loss function gradient", "Cluster labels" ], "correctAnswerIndex": 0, "explanation": "Q-Learning requires a scalar reward to update Q-values." }, { "id": 26, "questionText": "Which aspect differentiates SARSA from Q-Learning?", "options": [ "SARSA cannot learn", "Q-Learning uses supervised labels", "SARSA is on-policy, Q-Learning is off-policy", "SARSA ignores rewards" ], "correctAnswerIndex": 2, "explanation": "SARSA updates Q-values using the action actually taken (on-policy)." }, { "id": 27, "questionText": "Which operator ensures Q-Learning selects best next action in value update?", "options": [ "Average operator", "Min operator", "Random operator", "Max operator over next Q-values" ], "correctAnswerIndex": 3, "explanation": "max_a Q(s’,a’) chooses the highest estimated return for next state." }, { "id": 28, "questionText": "Q-Learning can handle stochastic environments because:", "options": [ "It ignores randomness", "It uses expected rewards over time", "It requires deterministic transitions", "It uses supervised labels" ], "correctAnswerIndex": 1, "explanation": "Q-values converge to expected returns even when rewards or transitions are probabilistic." }, { "id": 29, "questionText": "Which strategy balances exploration and exploitation in Q-Learning?", "options": [ "Random selection only", "Pure greedy policy", "Policy gradient", "ε-greedy policy" ], "correctAnswerIndex": 3, "explanation": "ε-greedy allows occasional random actions to explore while usually exploiting best-known actions." 
}, { "id": 30, "questionText": "What does convergence of Q-Learning mean?", "options": [ "Learning rate increases infinitely", "Agent stops moving", "Rewards become zero", "Q-values approximate optimal values for all state-action pairs" ], "correctAnswerIndex": 3, "explanation": "Convergence means the Q-table represents optimal expected returns, and the agent can act optimally." }, { "id": 31, "questionText": "If an agent in a gridworld uses Q-Learning with γ=0.9, what does this imply?", "options": [ "Only immediate reward matters", "Agent acts randomly", "Future rewards are important but slightly discounted", "Future rewards are ignored" ], "correctAnswerIndex": 2, "explanation": "A discount factor of 0.9 prioritizes long-term rewards while still considering immediate rewards." }, { "id": 32, "questionText": "Which condition can cause Q-Learning to fail to converge?", "options": [ "Low learning rate", "Sparse rewards", "Insufficient exploration of state-action space", "High discount factor" ], "correctAnswerIndex": 2, "explanation": "If some state-action pairs are never visited, Q-values for those pairs cannot converge." }, { "id": 33, "questionText": "What is the purpose of decaying ε in ε-greedy policy?", "options": [ "Increase randomness constantly", "Reduce exploration over time to favor exploitation", "Ignore exploration", "Stabilize learning rate" ], "correctAnswerIndex": 1, "explanation": "Decaying ε gradually shifts agent behavior from exploration to exploitation as it learns." }, { "id": 34, "questionText": "In a cliff-walking environment, Q-Learning might:", "options": [ "Learn to avoid the cliff using negative rewards", "Receive only positive rewards", "Ignore cliffs completely", "Always fall off" ], "correctAnswerIndex": 0, "explanation": "Negative rewards for falling off the cliff guide the agent to safe paths." }, { "id": 35, "questionText": "In Q-Learning, increasing α too high can cause:", "options": [ "Slow convergence", "Ignoring rewards", "Unstable learning and oscillating Q-values", "Reduced exploration" ], "correctAnswerIndex": 2, "explanation": "High learning rate can make Q-values change too abruptly and prevent convergence." }, { "id": 36, "questionText": "Which environment property makes Q-Learning suitable?", "options": [ "Continuous states only", "No reward signal", "Continuous actions only", "Discrete state and action space" ], "correctAnswerIndex": 3, "explanation": "Tabular Q-Learning requires discrete states and actions to store Q-values." }, { "id": 37, "questionText": "An agent receives noisy rewards. How does Q-Learning handle this?", "options": [ "Ignores all rewards", "Updates only once", "Randomly resets Q-table", "Estimates expected Q-values over multiple updates" ], "correctAnswerIndex": 3, "explanation": "Repeated updates average out noise, leading to stable Q-value estimates." }, { "id": 38, "questionText": "What does the max operator in Q-Learning introduce that SARSA does not?", "options": [ "Optimism about future rewards (off-policy)", "Exploration strategy", "Immediate reward only", "Policy gradients" ], "correctAnswerIndex": 0, "explanation": "Q-Learning considers best possible next action regardless of the policy, making it off-policy." 
}, { "id": 39, "questionText": "Which scenario requires function approximation in Q-Learning?", "options": [ "Supervised datasets", "Small discrete environments", "Large state spaces where tabular storage is impractical", "Clustering tasks" ], "correctAnswerIndex": 2, "explanation": "Function approximation (like neural networks) generalizes across states in large spaces." }, { "id": 40, "questionText": "Which is true about convergence speed in Q-Learning?", "options": [ "Faster with negative rewards only", "Faster with zero exploration", "Independent of learning rate", "Depends on learning rate, exploration, and reward structure" ], "correctAnswerIndex": 3, "explanation": "Proper tuning of α, ε, and reward design affects how quickly Q-values converge." }, { "id": 41, "questionText": "In a stochastic gridworld, Q-Learning can learn optimal actions because:", "options": [ "It ignores transitions", "It only uses immediate reward", "It estimates expected Q-values over many episodes", "It does not update Q-values" ], "correctAnswerIndex": 2, "explanation": "Averaging over multiple experiences accounts for stochasticity in transitions and rewards." }, { "id": 42, "questionText": "What is the difference between Q-Learning and SARSA in terms of risk?", "options": [ "Q-Learning may be more optimistic, SARSA is more conservative", "SARSA ignores rewards", "Q-Learning is on-policy", "SARSA ignores exploration" ], "correctAnswerIndex": 0, "explanation": "Q-Learning assumes optimal next action; SARSA updates based on actual next action, making it safer in risky environments." }, { "id": 43, "questionText": "Which combination of parameters can stabilize Q-Learning in noisy environments?", "options": [ "High learning rate and zero exploration", "Ignore rewards", "Moderate learning rate and sufficient exploration", "Low discount factor and random policy" ], "correctAnswerIndex": 2, "explanation": "Moderate α and proper exploration reduce oscillations in Q-values." }, { "id": 44, "questionText": "When using Q-Learning with γ close to 1 in long-horizon tasks, the agent:", "options": [ "Ignores future rewards", "Only explores randomly", "Focuses on long-term rewards", "Receives unstable rewards" ], "correctAnswerIndex": 2, "explanation": "High discount factor prioritizes cumulative rewards far into the future." }, { "id": 45, "questionText": "In Q-Learning, why is it necessary to visit all state-action pairs?", "options": [ "To update only visited states", "To ensure convergence to true optimal Q-values", "To decrease learning rate", "To ignore reward signals" ], "correctAnswerIndex": 1, "explanation": "Without exploring all state-action pairs, Q-values for some states may never converge." }, { "id": 46, "questionText": "Which technique can help Q-Learning in large state spaces?", "options": [ "Random action selection only", "Tabular Q-values only", "Function approximation with neural networks", "Ignore exploration" ], "correctAnswerIndex": 2, "explanation": "Approximation allows generalization to unseen states and reduces memory requirements." }, { "id": 47, "questionText": "In Q-Learning, what is the impact of too small α?", "options": [ "Oscillating Q-values", "Ignoring future rewards", "Faster convergence", "Slow learning" ], "correctAnswerIndex": 3, "explanation": "Small learning rate updates Q-values slowly, making learning take longer." 
}, { "id": 48, "questionText": "Which scenario demonstrates reward shaping in Q-Learning?", "options": [ "Reward only at episode end", "Providing intermediate positive rewards for partial progress", "Ignore reward signal", "Random rewards" ], "correctAnswerIndex": 1, "explanation": "Shaping rewards guide the agent step-by-step, improving convergence speed." }, { "id": 49, "questionText": "Which factor determines how much Q-Learning values immediate vs future reward?", "options": [ "Reward scaling only", "Discount factor γ", "Learning rate α", "Exploration ε" ], "correctAnswerIndex": 1, "explanation": "γ weighs future reward relative to immediate reward." }, { "id": 50, "questionText": "An agent overestimates Q-values due to stochastic rewards. Which can help?", "options": [ "Set γ=0", "Ignore rewards", "Use averaging or smoothing techniques", "Remove exploration" ], "correctAnswerIndex": 2, "explanation": "Averaging reduces overestimation caused by random fluctuations in reward." }, { "id": 51, "questionText": "Which advantage does Q-Learning have over Monte Carlo methods?", "options": [ "Cannot handle stochastic rewards", "Can update Q-values before episode ends (online learning)", "Needs supervised labels", "Requires full episode only" ], "correctAnswerIndex": 1, "explanation": "TD update allows learning from partial sequences without waiting for episode completion." }, { "id": 52, "questionText": "In an environment with many terminal states, Q-Learning:", "options": [ "Requires supervised signal", "Still updates Q-values until convergence", "Fails to update", "Ignores terminal states" ], "correctAnswerIndex": 1, "explanation": "Terminal states are treated as having zero future reward, allowing updates to proceed." }, { "id": 53, "questionText": "In Q-Learning, which factor controls optimism in action selection?", "options": [ "Discount factor γ only", "max operator over next Q-values", "Learning rate α only", "Exploration ε only" ], "correctAnswerIndex": 1, "explanation": "The max operator assumes the agent can always take the best next action, leading to optimistic estimates." }, { "id": 54, "questionText": "Which property makes Q-Learning off-policy?", "options": [ "Updates Q-values only for chosen action", "Updates Q-values using best possible action, not necessarily the action taken", "Requires reward shaping", "Uses Monte Carlo returns only" ], "correctAnswerIndex": 1, "explanation": "Off-policy means learning optimal Q-values independently of current behavior policy." }, { "id": 55, "questionText": "When using Q-Learning in a real robot, what challenge arises?", "options": [ "Rewards cannot be negative", "Tabular Q-table is sufficient", "Immediate rewards only", "Large state-action space requiring function approximation" ], "correctAnswerIndex": 3, "explanation": "Physical robots often have continuous states, necessitating approximation instead of tabular Q-tables." }, { "id": 56, "questionText": "Which technique reduces overestimation bias in Q-Learning?", "options": [ "Increase γ to 1", "Double Q-Learning", "Ignore exploration", "TD(0) only" ], "correctAnswerIndex": 1, "explanation": "Double Q-Learning uses two value estimators to prevent overestimating max Q-values." 
}, { "id": 57, "questionText": "Which factor helps avoid Q-Learning getting stuck in suboptimal policies?", "options": [ "Only immediate reward", "Sufficient exploration (ε-greedy)", "Ignoring future rewards", "Zero learning rate" ], "correctAnswerIndex": 1, "explanation": "Exploration ensures the agent discovers better actions over time." }, { "id": 58, "questionText": "In Q-Learning, high variance in reward signals can be addressed by:", "options": [ "Reducing discount factor to zero", "Averaging over multiple updates or smoothing Q-values", "Random exploration only", "Ignoring reward signals" ], "correctAnswerIndex": 1, "explanation": "Smoothing reduces fluctuations from noisy reward signals, stabilizing learning." }, { "id": 59, "questionText": "Which approach is needed when state space is continuous in Q-Learning?", "options": [ "Function approximation using neural networks or tile coding", "Tabular Q-learning", "Immediate reward update only", "Random exploration only" ], "correctAnswerIndex": 0, "explanation": "Continuous states cannot be stored in tables, so approximation methods are used." }, { "id": 60, "questionText": "If an agent converges slowly, which adjustment can help?", "options": [ "Ignore rewards", "Increase exploration and adjust learning rate", "Reduce discount factor to 0", "Remove ε-greedy strategy" ], "correctAnswerIndex": 1, "explanation": "Proper tuning of exploration and learning rate accelerates convergence." }, { "id": 61, "questionText": "Which scenario illustrates reward hacking in Q-Learning?", "options": [ "Sparse rewards guide learning properly", "All Q-values remain zero", "Agent follows intended task exactly", "Agent exploits unintended behavior to get high reward without completing task" ], "correctAnswerIndex": 3, "explanation": "Reward hacking happens when the agent finds loopholes in the reward function." }, { "id": 62, "questionText": "In episodic tasks, Q-Learning updates:", "options": [ "Only at episode end", "Randomly without rewards", "Only for terminal state", "After every step using TD update" ], "correctAnswerIndex": 3, "explanation": "Q-values are updated after each step using temporal-difference (TD) update." }, { "id": 63, "questionText": "Which combination stabilizes learning in stochastic, large environments?", "options": [ "Zero learning rate", "Tabular Q-learning only", "Ignore reward signal", "Function approximation + proper α + sufficient exploration" ], "correctAnswerIndex": 3, "explanation": "Function approximation generalizes across states and proper tuning ensures stable learning." }, { "id": 64, "questionText": "Which action selection method ensures exploration in Q-Learning?", "options": [ "Always greedy", "ε-greedy policy", "Random without Q-values", "Policy gradient only" ], "correctAnswerIndex": 1, "explanation": "ε-greedy policy randomly explores some actions to prevent getting stuck in suboptimal paths." }, { "id": 65, "questionText": "High γ and delayed reward may cause:", "options": [ "Faster random exploration", "Ignoring rewards", "Slow learning and dependency on accurate Q-values", "Immediate convergence" ], "correctAnswerIndex": 2, "explanation": "High γ emphasizes long-term reward, which requires careful learning over many steps." 
}, { "id": 66, "questionText": "In Q-Learning, which factor prevents overestimation bias?", "options": [ "No exploration", "Double Q-Learning", "Single Q-table always", "Zero discount factor" ], "correctAnswerIndex": 1, "explanation": "Using two Q-tables reduces the risk of overestimating action values." }, { "id": 67, "questionText": "What is the role of temporal-difference (TD) in Q-Learning?", "options": [ "Random Q-value assignment", "Requires full episode", "Updates Q-values incrementally using observed rewards and estimated future Q-values", "Ignores rewards" ], "correctAnswerIndex": 2, "explanation": "TD allows learning online without waiting for episode completion." }, { "id": 68, "questionText": "Which environment is challenging for tabular Q-Learning?", "options": [ "Deterministic rewards", "High-dimensional continuous state spaces", "Single-step tasks", "Small discrete grids" ], "correctAnswerIndex": 1, "explanation": "Large continuous spaces cannot store Q-values in tables; function approximation is required." }, { "id": 69, "questionText": "What does convergence in Q-Learning indicate?", "options": [ "Agent stops learning", "Rewards are zero", "Discount factor is ignored", "Q-values have stabilized near optimal values for all state-action pairs" ], "correctAnswerIndex": 3, "explanation": "Convergence means Q-values reflect true expected returns, allowing optimal policy selection." }, { "id": 70, "questionText": "Which is a limitation of standard Q-Learning?", "options": [ "Does not scale to large or continuous spaces without function approximation", "Cannot handle stochastic rewards", "Cannot learn discrete actions", "Requires supervised labels" ], "correctAnswerIndex": 0, "explanation": "Tabular Q-Learning is impractical for large or continuous environments; approximation methods are needed." }, { "id": 71, "questionText": "An autonomous car uses Q-Learning to navigate traffic. If it repeatedly chooses a risky shortcut with occasional high reward but often crashes, how should the agent be adjusted?", "options": [ "Reduce learning rate to zero", "Increase penalty for crashes and adjust ε-greedy to explore safer routes", "Ignore crashes and maximize reward", "Decrease discount factor to zero" ], "correctAnswerIndex": 1, "explanation": "Penalizing crashes guides the agent to safer policies, and proper exploration ensures it finds optimal routes." }, { "id": 72, "questionText": "A robot learns to pick objects using Q-Learning with sparse rewards only at completion. Learning is very slow. Which technique can help?", "options": [ "Ignore rewards", "Decrease learning rate to zero", "Remove discount factor", "Reward shaping: provide intermediate rewards for partial progress" ], "correctAnswerIndex": 3, "explanation": "Reward shaping accelerates learning by giving feedback during intermediate steps." }, { "id": 73, "questionText": "In a stochastic gridworld, an agent sometimes receives higher rewards for wrong actions due to randomness. Which method reduces overestimation?", "options": [ "Double Q-Learning with two value estimators", "Increase discount factor to 1", "Reduce learning rate to zero", "Single Q-table only" ], "correctAnswerIndex": 0, "explanation": "Double Q-Learning mitigates overestimation by using two independent Q-value estimates." }, { "id": 74, "questionText": "A Q-Learning agent is stuck in a local optimum due to greedy action selection. 
What can improve exploration?", "options": [ "Use single greedy action forever", "Set learning rate to zero", "Increase ε in ε-greedy or use decaying exploration", "Ignore rewards" ], "correctAnswerIndex": 2, "explanation": "Higher exploration probability allows discovering better actions outside local optimum." }, { "id": 75, "questionText": "In a multi-agent Q-Learning environment, agents’ actions influence each other. Which is a challenge?", "options": [ "Single agent updates suffice", "Discount factor becomes negative", "Rewards are ignored", "Non-stationary environment due to other agents learning simultaneously" ], "correctAnswerIndex": 3, "explanation": "Other agents’ learning changes environment dynamics, making convergence harder." }, { "id": 76, "questionText": "An agent using Q-Learning in a continuous state environment shows poor performance. Which approach is appropriate?", "options": [ "Increase tabular Q-table size indefinitely", "Ignore exploration", "Set learning rate to zero", "Use function approximation (neural networks) to estimate Q-values" ], "correctAnswerIndex": 3, "explanation": "Continuous states cannot be stored in tables; function approximation allows generalization." }, { "id": 77, "questionText": "During Q-Learning in a complex maze, the agent repeatedly oscillates between two states. Which adjustment may help?", "options": [ "Remove discount factor", "Increase ε to 1 permanently", "Decrease learning rate α slightly or adjust reward structure", "Ignore oscillations" ], "correctAnswerIndex": 2, "explanation": "Oscillations indicate learning instability; careful tuning of α and reward shaping stabilizes learning." }, { "id": 78, "questionText": "In Q-Learning for a trading bot, the agent overestimates future profits due to stochastic market fluctuations. Which strategy helps?", "options": [ "Ignore stochasticity", "Double Q-Learning to reduce overestimation bias", "Single Q-table with γ=1", "Remove exploration" ], "correctAnswerIndex": 1, "explanation": "Double Q-Learning separates action selection and evaluation to avoid bias from stochastic rewards." }, { "id": 79, "questionText": "A drone uses Q-Learning to navigate and avoid obstacles. It receives frequent collisions initially. How can learning be stabilized?", "options": [ "Assign strong negative rewards for collisions and gradually decay ε", "Ignore collisions", "Increase learning rate to 1", "Reduce discount factor to zero" ], "correctAnswerIndex": 0, "explanation": "Penalizing unsafe actions combined with proper exploration encourages safer policies." }, { "id": 80, "questionText": "An agent in Q-Learning receives inconsistent sensor readings affecting rewards. Which technique improves stability?", "options": [ "Smooth Q-value updates using averaging or low learning rate", "Use single-step greedy policy", "Ignore rewards", "Set discount factor to zero" ], "correctAnswerIndex": 0, "explanation": "Smoothing or smaller α reduces fluctuations caused by noisy feedback." }, { "id": 81, "questionText": "In a delivery robot scenario, the agent reaches destination quickly but takes unsafe shortcuts. What should be modified in Q-Learning?", "options": [ "Set γ=0", "Reduce learning rate to zero", "Increase penalty for unsafe actions and adjust reward structure", "Ignore unsafe actions" ], "correctAnswerIndex": 2, "explanation": "Adjusting penalties and rewards guides agent to safer optimal paths." }, { "id": 82, "questionText": "A Q-Learning agent in a video game receives sparse rewards only when completing levels. 
How can learning speed be improved?", "options": [ "Reduce discount factor", "Set learning rate to zero", "Ignore exploration", "Introduce intermediate rewards for milestones" ], "correctAnswerIndex": 3, "explanation": "Reward shaping provides more frequent feedback, improving learning speed." }, { "id": 83, "questionText": "In a dynamic environment with moving obstacles, why might Q-Learning converge slowly?", "options": [ "Rewards are always positive", "Learning rate too small", "Discount factor is zero", "Non-stationary environment causes changing optimal Q-values" ], "correctAnswerIndex": 3, "explanation": "Changing environment requires continual adaptation, slowing convergence." }, { "id": 84, "questionText": "A warehouse robot using Q-Learning keeps picking inefficient paths. How can policy improvement be encouraged?", "options": [ "Adjust reward for efficiency and continue exploration with ε-greedy", "Set discount factor to zero", "Ignore inefficiency", "Reduce learning rate to zero" ], "correctAnswerIndex": 0, "explanation": "Reward shaping and proper exploration help agent discover optimal, efficient paths." }, { "id": 85, "questionText": "In a financial portfolio scenario, the agent overestimates risk-adjusted return. Which technique reduces bias?", "options": [ "Double Q-Learning with separate estimators", "Set discount factor to 1", "Ignore stochasticity", "Use single-step greedy policy" ], "correctAnswerIndex": 0, "explanation": "Double Q-Learning mitigates overestimation in stochastic rewards or returns." }, { "id": 86, "questionText": "An agent navigates a multi-floor building with Q-Learning. Learning is slow due to state explosion. Which solution is best?", "options": [ "Increase tabular Q-table indefinitely", "Ignore exploration", "Reduce discount factor to zero", "Use function approximation or state aggregation" ], "correctAnswerIndex": 3, "explanation": "Function approximation allows handling large state spaces without storing all combinations." }, { "id": 87, "questionText": "In a stochastic taxi environment, the agent receives negative rewards occasionally due to random delays. How can learning remain stable?", "options": [ "Ignore negative rewards", "Use greedy policy only", "Set discount factor to zero", "Smooth Q-value updates with moderate α and sufficient exploration" ], "correctAnswerIndex": 3, "explanation": "Smoothing reduces the effect of random negative rewards on Q-values." }, { "id": 88, "questionText": "A Q-Learning agent repeatedly chooses high reward but risky actions in a factory. How to improve policy?", "options": [ "Ignore risky actions", "Reduce learning rate to zero", "Adjust reward function to penalize risk and encourage safe behavior", "Use discount factor zero" ], "correctAnswerIndex": 2, "explanation": "Incorporating risk penalties ensures agent balances reward and safety." }, { "id": 89, "questionText": "During Q-Learning, the agent oscillates between states with similar Q-values. Which adjustment stabilizes learning?", "options": [ "Remove exploration", "Reduce learning rate α or adjust rewards to break ties", "Increase discount factor γ to 1", "Ignore oscillations" ], "correctAnswerIndex": 1, "explanation": "Slower updates and differentiating rewards stabilize Q-value updates." }, { "id": 90, "questionText": "An agent receives inconsistent sensor readings affecting rewards. 
Which technique improves Q-Learning stability?", "options": [ "Use greedy policy only", "Set discount factor to zero", "Ignore rewards", "Use averaging of Q-value updates or low α" ], "correctAnswerIndex": 3, "explanation": "Averaging and lower learning rate reduce fluctuations caused by noisy rewards." }, { "id": 91, "questionText": "In a multi-agent Q-Learning environment, agents’ policies keep changing, making the environment non-stationary. What is a possible solution?", "options": [ "Reduce discount factor to zero", "Use centralized training with decentralized execution or stabilize policies", "Set learning rate α to zero", "Ignore other agents" ], "correctAnswerIndex": 1, "explanation": "Centralized training helps account for other agents’ behavior while allowing independent execution." }, { "id": 92, "questionText": "A robot learns Q-values in a noisy environment. To prevent overfitting to noise, which strategy is useful?", "options": [ "Ignore exploration", "Use greedy policy only", "Reduce learning rate α and use averaging", "Set discount factor to 1" ], "correctAnswerIndex": 2, "explanation": "Lower α and averaging stabilize Q-values despite noisy feedback." }, { "id": 93, "questionText": "An agent in Q-Learning is exploring a large maze but never discovers a crucial shortcut. What adjustment can help?", "options": [ "Remove reward", "Set learning rate α to zero", "Reduce discount factor", "Increase exploration rate ε temporarily" ], "correctAnswerIndex": 3, "explanation": "Higher exploration increases chances of discovering rare but optimal paths." }, { "id": 94, "questionText": "During Q-Learning in a grid with stochastic rewards, some Q-values fluctuate heavily. What can reduce this effect?", "options": [ "Ignore rewards", "Remove exploration entirely", "Use smaller α or moving average for updates", "Set γ=0" ], "correctAnswerIndex": 2, "explanation": "Smaller learning rate and averaging reduce sensitivity to random reward fluctuations." }, { "id": 95, "questionText": "In a simulated warehouse, multiple Q-Learning agents compete for resources. Which problem arises?", "options": [ "Discount factor irrelevant", "Rewards become deterministic", "Non-stationary environment due to other agents’ changing policies", "Exploration is unnecessary" ], "correctAnswerIndex": 2, "explanation": "Other agents learning simultaneously make the environment dynamic, complicating convergence." }, { "id": 96, "questionText": "A Q-Learning agent is learning a continuous control task using function approximation. What is a key risk?", "options": [ "Rewards are ignored", "Divergence of Q-values if learning rate or network is poorly tuned", "Greedy policy always converges", "Tabular Q-table overflow" ], "correctAnswerIndex": 1, "explanation": "Function approximation can destabilize learning without careful tuning of parameters." }, { "id": 97, "questionText": "An agent repeatedly exploits a high reward loop in a game but never completes the main objective. What is this an example of?", "options": [ "Reward hacking", "Exploration failure", "Optimal policy", "Random action" ], "correctAnswerIndex": 0, "explanation": "The agent finds a loophole in reward design, achieving high reward without completing the task." 
}, { "id": 98, "questionText": "In Q-Learning with a high discount factor γ≈1, what challenge may arise in long-horizon tasks?", "options": [ "Immediate convergence", "Rewards are ignored", "Slow convergence due to dependence on accurate long-term Q-values", "Exploration becomes irrelevant" ], "correctAnswerIndex": 2, "explanation": "High γ emphasizes future rewards, which requires learning over many steps and slows convergence." }, { "id": 99, "questionText": "A delivery robot using Q-Learning keeps choosing shortest paths but risks collisions. How to improve policy?", "options": [ "Ignore collisions", "Set γ=0", "Adjust reward function to penalize unsafe paths", "Set α=0" ], "correctAnswerIndex": 2, "explanation": "Reward shaping ensures agent balances efficiency with safety." }, { "id": 100, "questionText": "During Q-Learning in a noisy, stochastic environment, Q-values oscillate heavily. Which approach helps?", "options": [ "Set discount factor to zero", "Use greedy policy only", "Ignore rewards", "Reduce learning rate α, smooth updates, and ensure sufficient exploration" ], "correctAnswerIndex": 3, "explanation": "Smoothing and proper parameter tuning stabilize learning despite noisy feedback." } ] }