<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Diabetes</journal-id><journal-id journal-id-type="publisher-id">diabetes</journal-id><journal-id journal-id-type="index">23</journal-id><journal-title>JMIR Diabetes</journal-title><abbrev-journal-title>JMIR Diabetes</abbrev-journal-title><issn pub-type="epub">2371-4379</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v11i1e79195</article-id><article-id pub-id-type="doi">10.2196/79195</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Personalized Type 1 Diabetes Management: Reinforcement Learning&#x2013;Based Insulin Dosing and Glucose Forecasting</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Taku</surname><given-names>Ernest M</given-names></name><degrees>BA, MBA</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Gupta</surname><given-names>Vibhuti</given-names></name><degrees>BS, MS, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Singhal</surname><given-names>Ashutosh</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Biomedical Data Science, School of Applied Computational Sciences, Meharry Medical 
College</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Biostatistics &#x0026; Data Science, School of Public and Population Health, The University of Texas Medical Branch at Galveston</institution><addr-line>301 University Boulevard</addr-line><addr-line>Galveston</addr-line><addr-line>TX</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Li</surname><given-names>Sheyu</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Braune</surname><given-names>Katarina</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Vibhuti Gupta, BS, MS, PhD, Department of Biostatistics &#x0026; Data Science, School of Public and Population Health, The University of Texas Medical Branch at Galveston, 301 University Boulevard, Galveston, TX, 77555-1150, United States, 1 8065006843; <email>vibhgupt@utmb.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>3</day><month>6</month><year>2026</year></pub-date><volume>11</volume><elocation-id>e79195</elocation-id><history><date date-type="received"><day>16</day><month>06</month><year>2025</year></date><date date-type="rev-recd"><day>29</day><month>11</month><year>2025</year></date><date date-type="accepted"><day>03</day><month>12</month><year>2025</year></date></history><copyright-statement>&#x00A9; Ernest M Taku, Vibhuti Gupta, Ashutosh Singhal. Originally published in JMIR Diabetes (<ext-link ext-link-type="uri" xlink:href="https://diabetes.jmir.org">https://diabetes.jmir.org</ext-link>), 3.6.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Diabetes, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://diabetes.jmir.org/">https://diabetes.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://diabetes.jmir.org/2026/1/e79195"/><abstract><sec><title>Background</title><p>Optimizing insulin dosing and predicting future glucose levels for people with type 1 diabetes is challenging due to the dynamic nature of glucose metabolism. Traditional static insulin regimens fail to adapt to individual variability in diet, physical activity, stress, and metabolic fluctuations, leading to suboptimal glycemic control. Reinforcement learning (RL) offers a promising alternative by enabling personalized, real-time insulin adjustments that improve the balance between hyperglycemia and hypoglycemia.</p></sec><sec><title>Objective</title><p>This study aims to develop a deep Q-network (DQN)&#x2013;based RL system that dynamically personalizes insulin dosing recommendations using continuous glucose monitoring data, meal intake, and physical activity levels. 
By leveraging real-time data, the model adapts to patients&#x2019; evolving physiological states, enhancing glucose control and patient safety.</p></sec><sec sec-type="methods"><title>Methods</title><p>We used the OhioT1DM dataset (2018 and 2020), which includes 8 weeks of continuous glucose measurements, insulin dosing records, and physical activity data for twelve people with type 1 diabetes. The RL agent was designed with a state representation consisting of recent blood glucose levels, insulin doses, and lifestyle factors over a 2-hour window. The 2-hour window was selected based on the known pharmacodynamic profile of rapid-acting insulin (peak action within 90&#x2010;120 min), as well as the typical lag in glycemic response following meals or exercise. This window size captures both recent and delayed physiological effects while balancing data density and model stability. The action space included discrete insulin dose recommendations (eg, 0.5 U, 1 U, and 1.5 U). A reward function incentivized glucose levels within the target range (70&#x2010;180 mg/dL) while penalizing extreme deviations. The DQN model was trained to maximize reward by learning optimal dosing strategies through iterative trial and error.</p></sec><sec sec-type="results"><title>Results</title><p>Performance evaluation was conducted using both qualitative and quantitative metrics. Time-series analysis compared actual and predicted glucose levels, demonstrating effective glucose regulation. The RL model achieved a mean glucose level of 80.06 mg/dL, with a reward score of 10 during evaluation, indicating that most glucose predictions were maintained within the desired clinical range. This suggests the model has learned to regulate blood glucose effectively through adaptive insulin dosing. The root mean square error (12.39 mg/dL) was slightly higher than the mean absolute error (9.85 mg/dL), indicating stable predictions. 
Additionally, the percentage time in target range was 64.06%, suggesting that the model maintained glucose within the clinically safe range for a majority of the time.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The DQN-based RL model demonstrated its effectiveness in personalized insulin dosing while minimizing the risk of hypo- and hyperglycemia. This suggests the model has learned to regulate blood glucose effectively through adaptive insulin dosing. This approach represents a significant advancement over conventional methods, offering a scalable and adaptive strategy for real-world diabetes management, along with enhancing clinical trust and transparency through explainability techniques.</p></sec></abstract><kwd-group><kwd>personalized insulin dosing</kwd><kwd>reinforcement learning</kwd><kwd>deep Q-network</kwd><kwd>adaptive insulin regimens</kwd><kwd>machine learning</kwd><kwd>health care</kwd><kwd>artificial intelligence</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>According to the National Institute of Diabetes and Digestive and Kidney Diseases, diabetes is a chronic condition affecting more than 38.7 million people in the United States, with approximately 1.6 million diagnosed with type 1 diabetes (T1D) [<xref ref-type="bibr" rid="ref1">1</xref>]. T1D is an autoimmune condition where the body&#x2019;s immune system mistakenly attacks and destroys the insulin-producing beta cells in the pancreas, resulting in little to no insulin production in the body [<xref ref-type="bibr" rid="ref2">2</xref>]. Insufficient insulin levels in individuals with T1D can cause hypoglycemia (ie, low blood sugar), hyperglycemia (ie, high blood sugar), and ketoacidosis (ie, ketone development), with potential impacts on vital organs including the heart, kidneys, eyes, and feet. 
Although the underlying cause of T1D is still unknown, it is widely accepted that both genetic susceptibility and environmental influences contribute significantly to its development [<xref ref-type="bibr" rid="ref3">3</xref>]. Management of T1D primarily involves regulating blood glucose (BG) levels through insulin therapy, nutritional adjustments, physical activity, and routine glucose monitoring.</p><p>Insulin is a hormone the body uses to allow sugar (glucose) to enter cells to produce energy, and it plays a critical role in T1D. Thus, insulin dosing through injections or an insulin pump is the primary treatment to compensate for the body&#x2019;s inability to produce insulin. However, managing diabetes effectively involves precise insulin dosing to maintain optimal BG levels [<xref ref-type="bibr" rid="ref4">4</xref>]. Due to variability in patients&#x2019; responses to insulin based on factors such as diet, physical activity, stress, and metabolic fluctuations, the challenge of personalizing insulin dosing is critical for improved patient outcomes. Despite advancements in continuous glucose monitoring (CGM) technologies, insulin dosing remains predominantly reactive and static. These limitations underscore the need for intelligent, adaptive systems capable of personalized treatment.</p><p>With the rapid advancement of artificial intelligence (AI) and machine learning (ML) technologies, along with the growing availability of big data in health care, the integration of AI into health care is becoming increasingly feasible [<xref ref-type="bibr" rid="ref5">5</xref>]. AI&#x2019;s growing role in health care has driven innovations in areas such as image analysis, disease diagnosis and prognosis, clinical decision support, robotic surgery, virtual assistants, and drug target screening [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. 
However, there are still challenges with the dynamic adaptiveness and explainability of the models. Thus, we leveraged the potential of AI and ML to develop an adaptive and explainable system for personalized insulin dosing for people with T1D.</p><p>Several studies [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref15">15</xref>] have examined ML methods for diagnosing and predicting the early onset of type 2 diabetes mellitus (T2DM). Most of these works have applied multiple ML algorithms such as decision trees, support vector machines, random forests, gradient boosting, k-nearest neighbors, neural networks, etc., using various demographics and clinical variables to diagnose and predict diabetes onset. These approaches often rely on static training paradigms and lack the adaptability needed to accommodate dynamic or nonstationary data distributions. Many studies [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref19">19</xref>] have explored ML approaches for BG prediction. These studies have used classical time series models, support vector machines, random forests, and long short-term memory models to predict hypoglycemic events and forecast BG over near-term horizons. These studies are limited in their ability to provide personalized predictions due to an overreliance on CGM data, without incorporating additional contextual factors such as physical activity, carbohydrate intake, and insulin dosage. Furthermore, they often experience interpretability constraints.</p><p>The traditional models often lacked the adaptability required to respond dynamically to changes in a patient&#x2019;s condition. This limitation paved the way for reinforcement learning (RL) models capable of continuous learning and adaptation. RL has attracted interest for its ability to dynamically optimize insulin dosing and predict glucose levels in real time by leveraging data from CGMs. 
There are some works focused on using RL for dynamic insulin recommendation and BG prediction [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref23">23</xref>]. Early works used Q-learning models [<xref ref-type="bibr" rid="ref20">20</xref>], followed by more advanced methods such as deep Q-networks (DQNs) [<xref ref-type="bibr" rid="ref21">21</xref>], actor-critic algorithms [<xref ref-type="bibr" rid="ref23">23</xref>], and model-predictive control [<xref ref-type="bibr" rid="ref22">22</xref>]. However, although most of these models performed well in simulations, they required extensive manual tuning, making them impractical for long-term use in real-world settings. Thus, these approaches lack integration capabilities with real-world datasets or comprehensive evaluation against supervised models. Model explainability is another limitation in these RL-based methods, which is essential for clinical deployment. Additionally, previous studies frequently rely on synthetic datasets, offer limited explainability, and fail to benchmark RL against traditional machine learning models.</p><p>Despite the growing success of RL in health care applications, deploying these models in real-world clinical settings remains a challenge due to a lack of interpretability in the models. Gottesman et al [<xref ref-type="bibr" rid="ref24">24</xref>] discussed the practical hurdles of implementing RL models in health care, such as handling noisy data, model interpretability, and patient safety. Our work directly addresses these issues by applying Shapley additive explanations (SHAP) [<xref ref-type="bibr" rid="ref25">25</xref>] and local interpretable model-agnostic explanations (LIME) [<xref ref-type="bibr" rid="ref26">26</xref>] for better model explainability and by thoroughly testing the model on various patient datasets to ensure robustness and safety. 
Additionally, another study [<xref ref-type="bibr" rid="ref27">27</xref>] highlights the importance of simulating real-world conditions for RL models to ensure their generalization to unseen patient scenarios. We tackle this challenge by incorporating simulated testing and leveraging CGM data to train and test our RL model.</p><p>To address the above-mentioned limitations, we propose a DQN-based RL system that dynamically personalizes insulin dosing recommendations using CGM data, meal intake, and physical activity levels from a real-world dataset from the OhioT1DM challenge [<xref ref-type="bibr" rid="ref28">28</xref>], compare the effectiveness of the built model with benchmark ML models, and incorporate explainability using SHAP [<xref ref-type="bibr" rid="ref25">25</xref>] and LIME [<xref ref-type="bibr" rid="ref26">26</xref>].</p><p>Building on prior DQN-based insulin-dosing research, our paper emphasizes an explainability-first RL framework using the real-world multimodal OhioT1DM dataset. We integrate SHAP and LIME analyses with clinically relevant vignettes (eg, premeal bolus timing and postexercise hypoglycemia mitigation) to contextualize model behavior. Our safety-aware reward rationale is supported by a compact, systematic sensitivity design&#x2014;including asymmetry, stability, and insulin on board (IOB) proxy components across multiple glycemic targets&#x2014;and by a reproducible data pipeline that harmonizes pump or CGM data with wearable sensor signals. Finally, we implement leakage-safe data splits and report mean (SD) with 95% CIs for time in range (TIR), time below range (TBR), and time above range (TAR), as well as error metrics, to enhance clinical interpretability and methodological rigor.</p><p>Our study uniquely focuses on model explainability&#x2014;an essential factor for clinical trust that is largely overlooked in existing research. In this study, we applied SHAP to assess feature importance within the DQN model. 
SHAP values quantify the contribution of each input feature to the model&#x2019;s decisions&#x2014;for instance, illustrating how &#x201C;time since last insulin dose&#x201D; influences insulin recommendations. This explainability perspective is lacking in existing works, which have focused solely on the performance of RL algorithms without examining the rationale behind specific dosing decisions. Another distinguishing aspect of our work is the incorporation of LIME, which is used to explain individual insulin dose recommendations by approximating the DQN model with interpretable surrogate models. For instance, a clinician can interpret a recommended dose by analyzing contributing factors like meal timing or prior glucose patterns. By integrating SHAP and LIME, our work bridges the gap between black-box AI models and the level of interpretability necessary for regulatory approval and clinical implementation.</p></sec><sec id="s1-2"><title>Objective</title><p>We have developed a DQN-based RL system that dynamically personalizes insulin dosing recommendations using CGM data, meal intake, and physical activity levels, and has the capability to predict future BG levels at specified intervals (eg, 30 or 60 min) using historical event data. By leveraging real-time data, the model adapts to patients&#x2019; evolving physiological states, enhancing glucose control and patient safety. We have compared the effectiveness of our model with the benchmark ML models and incorporated explainability using SHAP and LIME to enhance model understanding. The model&#x2019;s prediction can aid in forecasting episodes of hyperglycemia or hypoglycemia and inform optimal insulin dosing and lifestyle adjustments. Our major contributions in this study are as follows:</p><list list-type="order"><list-item><p>We have proposed and developed a novel and adaptive RL-based framework for personalized insulin dosing recommendations as well as predicting future BG levels. 
Our framework consists of three major components:</p><list list-type="alpha-lower"><list-item><p>Deep Q-network: DQN uses a value-based approach where a neural network approximates the Q-value function to decide optimal actions for given states. This is distinct because DQN is better suited for discrete action spaces, which aligns well with insulin dosing (eg, no dose, low dose, medium dose, and high dose).</p></list-item><list-item><p>State space representation: this consists of lag features, rolling averages, and time-based features, offering a richer, temporally aware state representation. It explicitly models time since last insulin dose and time since last meal, emphasizing the physiological delay in glucose-insulin dynamics.</p></list-item><list-item><p>Reward design: the reward function penalizes extreme hypo- and hyperglycemia events (&#x003C;70 and &#x003E;250 mg/dL), with positive rewards for glucose levels within the target range (70&#x2010;180 mg/dL). Our approach introduces adaptive penalties, which vary with the severity of glucose deviation, potentially improving safety margins.</p></list-item></list></list-item><list-item><p>We have performed extensive evaluation of our RL-based framework performance with various metrics and benchmarked the performance with the state-of-the-art long short-term memory (LSTM) model.</p></list-item><list-item><p>We have incorporated explainability into the built RL-based model by integrating SHAP and LIME methods to assess feature importance and explain individual insulin dosing recommendations.</p></list-item></list></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>The OhioT1DM dataset is a deidentified dataset requested from the authors through a data use agreement. 
No institutional review board review or approval is required because the data are completely deidentified.</p></sec><sec id="s2-2"><title>Dataset and Preprocessing</title><p>In this study, we used the OhioT1DM dataset from 2018 and 2020, provided by the Blood Glucose Level Prediction challenge [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. The dataset was generated by monitoring 12 individuals with T1D over an 8-week period, capturing a range of BG-related data. It includes CGM readings recorded every five minutes, insulin delivery data from insulin pumps, and self-reported events such as meals, work, sleep, psychological stress, and physical activity, all logged via a smartphone app. Additionally, physical activity was tracked using a sensor band. The first cohort, consisting of 6 individuals, wore Basis Peak fitness bands; the dataset contains 5-minute aggregated measurements of heart rate, galvanic skin response (GSR), skin temperature, air temperature, and step count [<xref ref-type="bibr" rid="ref28">28</xref>]. The second cohort, also comprising 6 individuals, wore the Empatica Embrace; the dataset provides 1-minute aggregated measurements of GSR, skin temperature, and acceleration magnitude [<xref ref-type="bibr" rid="ref28">28</xref>]. Notably, meal and insulin data are represented as discrete user-entered values rather than continuous series like carbohydrate intake or insulin on board. 
The detailed data description is shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Dataset description.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Attributes</td><td align="left" valign="bottom">Description</td><td align="left" valign="bottom">Unit</td></tr></thead><tbody><tr><td align="left" valign="top">timestamp</td><td align="left" valign="top">Date and time of the event</td><td align="left" valign="top">DateTime</td></tr><tr><td align="left" valign="top">glucose_level</td><td align="left" valign="top">Blood glucose level at a specific time</td><td align="left" valign="top">mg/dL</td></tr><tr><td align="left" valign="top">insulin_dose</td><td align="left" valign="top">Administered insulin dose during the event</td><td align="left" valign="top">Units (U)</td></tr><tr><td align="left" valign="top">carbs</td><td align="left" valign="top">Carbohydrate intake associated with a meal</td><td align="left" valign="top">grams (g)</td></tr><tr><td align="left" valign="top">meal_type</td><td align="left" valign="top">Type of meal (eg, breakfast, lunch, and snack)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr><tr><td align="left" valign="top">exercise_intensity</td><td align="left" valign="top">Intensity of physical activity</td><td align="left" valign="top">Intensity Level (1-5)</td></tr><tr><td align="left" valign="top">exercise_duration</td><td align="left" valign="top">Duration of the exercise</td><td align="left" valign="top">minutes</td></tr><tr><td align="left" valign="top">bolus_dose</td><td align="left" valign="top">Insulin dose delivered as a bolus</td><td align="left" valign="top">Units (U)</td></tr><tr><td align="left" valign="top">heart_rate</td><td align="left" valign="top">Heart rate during the event</td><td align="left" valign="top">beats per 
minute (bpm)</td></tr><tr><td align="left" valign="top">sleep_quality</td><td align="left" valign="top">Quality of sleep</td><td align="left" valign="top">Percent (%)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><p>The OhioT1DM dataset includes participant demographics: gender distribution (6 male, 6 female), age ranges (20&#x2010;80 y), and device use. All participants were on insulin pump therapy, using Medtronic 530G or 630G models with Medtronic Enlite CGM sensors. Both basal and bolus insulin data are included. While BMI and diabetes duration are not reported in the public dataset, all participants were experienced pump users, and therapy modalities were consistent across the cohort [<xref ref-type="bibr" rid="ref28">28</xref>]. The detailed description is shown in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>The raw dataset is in XML format. We converted the XML files into CSV format first, performed preprocessing steps such as handling missing data and normalizing features, and generated engineered features, including lag, rolling, and time-based attributes, for further analysis. We extracted relevant data such as glucose levels, insulin doses (from bolus events), meals (carbohydrate intake), exercise, and other factors. The key characteristics of the extracted data are shown in <xref ref-type="table" rid="table1">Table 1</xref>. The extracted data consist of 1,191,753 records with the attributes shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p><p>Timestamps are aligned to 5-minute intervals, and Empatica 1-minute channels are down-sampled (mean or median) to 5 minutes before fusion. Missing CGM gaps are imputed using backward or forward fill up to a short horizon (&#x2264;15 min), and longer gaps are excluded from learning and evaluation. 
The sensor outliers (eg, CGM &#x003C;40 mg/dL or &#x003E;400 mg/dL, nonphysiologic spikes) are clipped and flagged. The duplicate or overlapping basal and multipart bolus events (square or dual) are resolved to continuous IOB traces.</p><p>Time series and distribution plots were used to identify trends, as illustrated in the figures. <xref ref-type="fig" rid="figure1">Figure 1A and B</xref> show the distribution of glucose levels overall and within a specific range, respectively. As shown in <xref ref-type="fig" rid="figure1">Figure 1A</xref>, there is a highly skewed distribution of glucose levels where a significant number of entries have glucose levels close to 0, which could indicate periods of hypoglycemia or potentially erroneous readings. <xref ref-type="fig" rid="figure1">Figure 1B</xref> shows the distribution of glucose levels ranging from 50 to 400, with an average level of 150. There is another peak around 100, as shown in <xref ref-type="fig" rid="figure1">Figure 1B</xref>, which is closer to typical blood sugar levels but still slightly on the lower side of normal. There are fewer instances of high glucose levels, suggesting fewer episodes of high blood sugar in the dataset.</p><p><xref ref-type="fig" rid="figure2">Figure 2A and B</xref> illustrate the glucose levels over time for 2 sample patients from the dataset. Given that glucose levels are timestamped, we analyze how these levels change over time within individual patients and across different days or times of day. We can observe a general downward trend, with minor fluctuations. This suggests the potential usefulness of considering past values (lag features) when predicting future glucose levels. 
The descriptive statistics for all 12 participants&#x2019; data are in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>(A) Distribution of glucose levels (left) and (B) distribution of glucose levels in specific range (right).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="diabetes_v11i1e79195_fig01.png"/></fig><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>(A) Glucose levels variations for patient 1 (left) and (B) glucose levels variations for patient 2 (right).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="diabetes_v11i1e79195_fig02.png"/></fig></sec><sec id="s2-3"><title>Model Development</title><p>RL [<xref ref-type="bibr" rid="ref30">30</xref>] focuses on the task of deriving a policy that maps states to actions in a way that maximizes cumulative reward. These problems are inherently closed-loop, as the agent&#x2019;s actions directly affect subsequent environmental states and observations. Unlike in supervised learning, the agent receives no explicit instruction on which actions to take; it must learn the optimal actions through interaction with the environment, identifying those that maximize reward via trial and error. RL has been successfully applied across various scientific domains, including robotics and control systems [<xref ref-type="bibr" rid="ref31">31</xref>], manufacturing, and combinatorial search tasks like those found in computer games [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. 
In the health care domain, RL has leveraged historical medical data, such as medical images and treatment regimens, for tasks including cancer prediction, diagnosis, and prognosis [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>].</p><p>RL systems have multiple components: an agent, a policy, a reward signal, a value function, and optionally a model for the environment [<xref ref-type="bibr" rid="ref30">30</xref>]. In RL, the goal is for an agent to learn a policy that maximizes a cumulative reward through interacting with an environment. A policy represents a mapping from states to actions that dictates the agent&#x2019;s behavior at a specific point in time. It aligns with the concept of stimulus&#x2013;response associations in psychology, where the term &#x201C;stimulus&#x201D; encompasses both external inputs and internally generated cues within the organism [<xref ref-type="bibr" rid="ref30">30</xref>]. A reward signal defines the goal of a problem. At each time step, the environment provides the RL agent with a numerical reward. The agent&#x2019;s primary objective is to maximize the cumulative reward it obtains over time [<xref ref-type="bibr" rid="ref30">30</xref>]. While the reward signal reflects the immediate desirability of a given outcome, the value function captures long-term benefit. Specifically, the value of a state represents the expected cumulative reward an agent can obtain in the future, beginning from that state [<xref ref-type="bibr" rid="ref30">30</xref>]. The fourth and final component present in some RL systems is a model of the environment. This model simulates the environment&#x2019;s dynamics or, more broadly, enables the agent to make predictions about future environmental responses [<xref ref-type="bibr" rid="ref30">30</xref>].</p><p>We trained a DQN agent to maximize time in target glucose range. States included glucose, insulin, meals, and exercise. 
Actions were discrete insulin doses. The reward function penalized values outside 70&#x2010;180 mg/dL. The core RL equation is the Bellman equation, which forms the basis for algorithms such as Q-learning and DQN.</p><p>The Bellman equation for Q-learning is as follows:</p><disp-formula id="E1"><label>(1)</label><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>Q</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>a</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>r</mml:mi><mml:mo>+</mml:mo><mml:mi>&#x03B3;</mml:mi><mml:munder><mml:mo movablelimits="false">max</mml:mo><mml:msup><mml:mi>a</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:munder><mml:mi>Q</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msup><mml:mi>s</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mi>a</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:math></disp-formula><list list-type="bullet"><list-item><p><italic>Q</italic>(<italic>s</italic>, <italic>a</italic>) is the Q-value (the expected future reward) for taking action <italic>a</italic> in state <italic>s</italic>.</p></list-item><list-item><p><italic>r</italic> is the immediate reward received after taking action <italic>a</italic> in state <italic>s</italic>.</p></list-item><list-item><p>&#x03B3; is the discount factor (typically between 0 and 1), which determines how much future rewards are worth compared to immediate rewards.</p></list-item><list-item><p>max<sub><italic>a&#x2032;</italic></sub> <italic>Q</italic>(<italic>s&#x2032;</italic>, <italic>a&#x2032;</italic>) is the maximum future Q-value (the best future reward the agent can achieve) for the next state <italic>s&#x2032;</italic> and all possible actions <italic>a&#x2032;</italic>.</p></list-item></list><p>The Bellman equation details with respect to the problem are described below:</p><list list-type="order"><list-item><p>State (s): in the context of diabetes management, the state can be 
represented as a combination of features such as the current glucose level, insulin dose, meal intake, exercise, etc</p></list-item><list-item><p>Action (a): the action refers to the insulin dose recommendation or adjustment (eg, deciding how much insulin to administer at the current state).</p></list-item><list-item><p>Reward (r): the reward is a feedback signal to indicate the success of the agent&#x2019;s action. In diabetes management, the reward might penalize for hypo- or hyperglycemia events and give positive rewards for keeping glucose levels within a healthy range.</p></list-item><list-item><p>Discount factor (&#x03B3;): the discount factor controls the balance between prioritizing immediate rewards (eg, maintaining glucose levels right now) and future rewards (eg, preventing long-term health issues caused by poor glucose control).</p></list-item><list-item><p>Max future Q-value: the agent estimates the maximum future reward it can achieve by taking the best action in the next state. This is used to help the agent choose actions that will not only lead to immediate benefits but also to longer-term gains.</p></list-item></list></sec><sec id="s2-4"><title>Reinforcement Learning Model Selection</title><p>The following steps describe the reinforcement learning model selection and training process:</p><list list-type="order"><list-item><p>Build and train a reinforcement learning model: develop a DQN model to optimize insulin dosing based on extracted parameters such as glucose levels. 
Details for the DQN model are in the section below.</p></list-item><list-item><p>Evaluate and optimize the RL model: after training, we evaluate the RL model&#x2019;s performance using both datasets [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>], optimize it, and detail the evaluation metrics.</p></list-item><list-item><p>Establish state representation: BG levels, insulin doses, meal intake, and physical activity over a recent time window (eg, 2 h).</p></list-item><list-item><p>Define the reward function: a penalized reward system that reduces points for glucose levels outside the target range, with larger penalties for extreme hypo- or hyperglycemia events.</p></list-item><list-item><p>Implement the training loop for the DQN agent to interact with our diabetes management environment by setting up the environment and the agent, then running through multiple episodes to allow the agent to learn optimal actions based on the given state.</p></list-item><list-item><p>We set up the DQN agent and its learning mechanisms, and we train this agent using the simulation environment by repeatedly interacting with it (using the step function) and applying the replay function to learn from past actions.</p></list-item></list></sec><sec id="s2-5"><title>DQN Rule</title><p>In the context of DQNs, the Bellman equation is approximated using neural networks to learn the Q-value function. 
The update rule is:</p><disp-formula id="E2"><label>(2)</label><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>Q</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>a</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">&#x2190;</mml:mo><mml:mi>Q</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>a</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mrow><mml:mo>&#x03B1;</mml:mo></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>r</mml:mi><mml:mo>+</mml:mo><mml:mrow><mml:mo>&#x03B3;</mml:mo></mml:mrow><mml:munder><mml:mo movablelimits="false">max</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>&#x2032;</mml:mo></mml:mrow></mml:munder><mml:mi>Q</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>s</mml:mi><mml:mrow><mml:mo>&#x2032;</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mi>a</mml:mi><mml:mrow><mml:mo>&#x2032;</mml:mo></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>Q</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>a</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><list list-type="bullet"><list-item><p><italic>&#x03B1;</italic> is the learning rate that controls how much the Q-values are updated at each step.</p></list-item><list-item><p>The expression <italic>r+&#x03B3; max<sub>a&#x2032;</sub> Q(s&#x2032;,a&#x2032;)</italic> is the target Q-value, and the difference <italic>r+&#x03B3; max<sub>a&#x2032;</sub> Q(s&#x2032;,a&#x2032;)&#x2212;Q(s,a)</italic> is the temporal difference error.</p></list-item></list></sec><sec id="s2-6"><title>Details With Respect to the Problem</title><p>The following key concepts are central to understanding the problem formulation in reinforcement learning&#x2013;based diabetes management systems:</p><list 
list-type="order"><list-item><p>Policy: the policy defines the agent&#x2019;s behavior &#x2014; which actions to take in different states. The agent seeks to learn an optimal policy that maximizes the total reward.</p></list-item><list-item><p>Exploration versus exploitation: to learn the best policy, the agent must explore different actions (exploration) while also choosing actions it believes will give the best reward based on its current knowledge (exploitation).</p></list-item></list><p>In the diabetes management system, the agent continuously learns to recommend insulin doses based on real-time data such as glucose levels, insulin history, and meal intake, balancing immediate glucose control with long-term health management.</p></sec><sec id="s2-7"><title>Feature Engineering</title><p>We extracted the following features after data analysis:</p><list list-type="order"><list-item><p>Lag features: these capture the prior values of glucose levels. Given the data collection frequency of every 5 minutes, we have created lag features representing glucose levels from 30 to 60 minutes prior.</p></list-item><list-item><p>Rolling window features: these features consist of rolling averages to smooth out fluctuations and capture trends over time, containing rolling means and standard deviations over various windows, such as 30 minutes and 60 minutes.</p></list-item><list-item><p>Time-based features: because physiological responses can vary throughout the day, including features such as the hour of the day can capture these variations effectively. We extracted time-based features from the timestamp, such as the hour of the day.</p></list-item></list></sec><sec id="s2-8"><title>Evaluation Metrics</title><p>The performance was evaluated quantitatively using standard metrics such as mean absolute error (MAE), root mean square error (RMSE), and time-in-range (TIR). 
MAE [<xref ref-type="bibr" rid="ref36">36</xref>] and RMSE [<xref ref-type="bibr" rid="ref37">37</xref>] are used to assess the accuracy of predictive models for glucose forecasting, whereas TIR [<xref ref-type="bibr" rid="ref38">38</xref>] represents the percentage of time a person&#x2019;s glucose levels remain within the target range, typically 70&#x2010;180 mg/dL for most adults with diabetes.</p><p>The MAE [<xref ref-type="bibr" rid="ref36">36</xref>] is the average of the absolute errors (ie, the absolute differences between the actual and predicted glucose values).</p><disp-formula id="E3"><label>(3)</label><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>M</mml:mi><mml:mi>A</mml:mi><mml:mi>E</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>n</mml:mi></mml:mfrac><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munderover><mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>c</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>t</mml:mi><mml:mi>u</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>MAE is computed as shown in <xref ref-type="disp-formula" rid="E3">Equation 3</xref>, where n is the number of test instances. 
A lower MAE value indicates a better model.</p><p>RMSE [<xref ref-type="bibr" rid="ref37">37</xref>] is the square root of the average of squared errors (ie, the squared differences between the actual and predicted glucose values). RMSE can be represented as shown in <xref ref-type="disp-formula" rid="E4">Equation 4</xref>, where n is the number of test instances. A lower RMSE value indicates a better model.</p><disp-formula id="E4"><label>(4)</label><mml:math id="eqn4"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>R</mml:mi><mml:mi>M</mml:mi><mml:mi>S</mml:mi><mml:mi>E</mml:mi><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:msqrt><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mi>n</mml:mi><mml:mtext>&#x00A0;</mml:mtext></mml:mrow></mml:mfrac><mml:mtext>&#x00A0;</mml:mtext><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munderover><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>c</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>t</mml:mi><mml:mi>u</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>i</mml:mi><mml:mtext>&#x00A0;</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:msqrt></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>TIR is represented in <xref ref-type="disp-formula" rid="E5">Equation 5</xref>, where higher TIR is associated with better glucose control as compared to lower TIR 
values.</p><disp-formula id="E5"><label>(5)</label><mml:math id="eqn5"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>T</mml:mi><mml:mi>I</mml:mi><mml:mi>R</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>g</mml:mi><mml:mi>l</mml:mi><mml:mi>u</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>g</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>w</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>h</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>g</mml:mi><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>n</mml:mi><mml:mi>u</mml:mi><mml:mi>m</mml:mi><mml:mi>b</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:mfrac><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mn>100</mml:mn></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula></sec><sec id="s2-9"><title>Hypothesis</title><p>We expect the RL-based model to outperform baseline approaches by increasing the percentage of time glucose levels remain in the target range while minimizing hypo- 
and hyperglycemia events.</p></sec><sec id="s2-10"><title>Simulation Outcome</title><sec id="s2-10-1"><title>State</title><p>The following describes the simulation state:</p><list list-type="bullet"><list-item><p>Glucose level: 80.06 mg/dL, which falls within the target range (70&#x2010;180 mg/dL). This is a healthy and realistic level for a patient managing diabetes, indicating that the simulation now better reflects the models&#x2019; physiological responses to insulin.</p></list-item><list-item><p>Time since last meal: 1 hour, which is a typical scenario and can influence immediate subsequent readings.</p></list-item><list-item><p>Time since last dose: 1 hour, reflecting recent insulin activity, which might be stabilizing BG.</p></list-item><list-item><p>Insulin type: generic bolus (mealtime) and basal (background) insulin doses.</p></list-item><list-item><p>Time of day: 12 (noon), a common time for a meal which might coincide with a postprandial glucose reading.</p></list-item></list></sec><sec id="s2-10-2"><title>Reward</title><p>The maximum positive reward is 10, reflecting optimal glucose management in the simulation. This reward confirms that the glucose level is within the desired range, and the model&#x2019;s reward system is functioning as intended to encourage similar outcomes. The adjusted parameters helped align the simulation more closely with realistic diabetes management scenarios. The positive reward outcome encourages the model to replicate or aim for similar decisions under comparable circumstances, reinforcing good management practices. We implemented a conservative, interpretable reward function prioritizing glycemic safety, consisting of in-range bonuses (70&#x2010;180 mg/dL) and out-of-range penalties, with an asymmetric variant that assigns stronger penalties to hypoglycemia than to hyperglycemia. 
To evaluate robustness, we conducted a structured sensitivity analysis across (1) alternative glycemic thresholds (70&#x2010;180, 80&#x2010;160, and 70&#x2010;140 mg/dL) and (2) incremental reward components, including the baseline formulation, asymmetric penalties, a stability penalty on |&#x0394;G/&#x0394;t| (ie, a term that penalizes rapid glucose changes to reduce glycemic variability, where &#x0394;G represents the change in glucose over the time interval &#x0394;t), and an IOB proxy penalty to discourage insulin stacking. Across conditions, we summarize TIR, time below range (TBR), time above range (TAR), MAE, and RMSE, showing that safety-focused asymmetry consistently reduces TBR with minimal impact on TIR. These findings support the use of a simple yet clinically aligned reward structure for this initial study.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Experimental Setup</title><p>The models were trained for 10 episodes using simulated environments initialized with realistic patient data extracted from the OhioT1DM dataset. The model architecture consisted of a 3-layer fully connected neural network with ReLU activations, Adam optimizer (learning rate 0.001), epsilon-greedy exploration (&#x03B5; decay from 1.0 to 0.1 over episodes), and a replay buffer size of 5000. We used a time-blocked split per subject consisting of weeks 1&#x2010;6 as the training set, week 7 as the validation set, and week 8 as the test set. We used TensorFlow 2.12 for the neural network implementation of the DQN agent, OpenAI Gym for simulating the agent&#x2019;s interaction with glucose trajectories and insulin dosing responses, Pandas and NumPy for data preprocessing and state-space formulation, and Matplotlib and seaborn for data visualization. The environment represented each state as a vector composed of time-series values over a 2-hour sliding window of glucose levels, meal carbohydrate intake, physical activity intensity or duration, and prior insulin doses. 
The experiments were conducted on a local machine with Apple M1 Pro 10-core CPU and 32 GB RAM. Training was intentionally limited to 10 episodes due to computational constraints, balancing exploratory learning against excessive runtime. The learning curves showed stabilization by episode 8, with negligible performance improvement thereafter (&#x0394;TIR&#x003C;0.5%). This early plateau in episodic reward and TIR indicates initial stabilization but not the full convergence typically expected in deep RL. Future work will extend training to at least 300 episodes, incorporate soft target updates (&#x03C4;=0.005), maintain a replay buffer of at least 100,000 transitions, decay exploration to <italic>&#x03B5;</italic>=0.05, and use leave-one-subject-out (LOSO) cross-validation. Each episode consisted of approximately 1000&#x2010;2000 state transitions (steps), depending on the patient&#x2019;s data length and time granularity. To ensure reproducibility, random seeds were set for NumPy and TensorFlow. Model checkpoints were stored and reloaded for inference and evaluation. Performance metrics such as MAE, RMSE, and TIR% were computed on unseen test data segmented from each patient&#x2019;s profile. Reward shaping and epsilon decay were used for policy exploration. Our study provides valuable insights into the performance of the DQN agent in managing glucose levels through insulin dosing.</p></sec><sec id="s3-2"><title>Integration With DQN Algorithm for Training</title><p>The simulation environment produces realistic and consistent results when integrated with the DQN model for training. We initialize the DQN agent by setting up the neural network model that will learn the Q-values. We then simulate interactions by running episodes in which the agent interacts with the environment, makes decisions based on its current policy, observes rewards, and updates its policy accordingly. 
We then proceeded with the training loop: for each episode, the environment was reset; for each time step within the episode, the DQN agent chose an action, observed the new state and reward, and stored this experience; and the DQN agent&#x2019;s neural network was periodically updated by replaying a batch of experiences.</p></sec><sec id="s3-3"><title>Model Performance</title><p>We compared the glucose prediction performances of two models (ie, DQN and LSTM) in this study. We observed that DQN achieves a slightly lower RMSE (12.39) as compared to LSTM (13.87), indicating it makes fewer large-scale prediction errors. RMSE penalizes larger errors more heavily, so this suggests that DQN is better at avoiding extreme outliers in glucose level prediction. Moreover, DQN is more stable and robust in maintaining glucose predictions closer to the true values, particularly in high-variance regions. LSTM achieves a substantially lower MAE (3.69) than DQN (9.85), suggesting its day-to-day average predictions are closer to the actual glucose levels than DQN&#x2019;s. However, MAE does not penalize outliers as strongly as RMSE. So, while LSTM is generally more accurate on average, it is more vulnerable to large mistakes, as reflected by its higher RMSE.</p><p>The DQN model (64.06%) outperforms LSTM (62.10%) slightly for TIR. This is critical in real-world diabetes management, where maintaining glucose in a safe range directly correlates with reduced risk of complications. Since DQN is an RL-based model, reward is a key internal measure of how well it is optimizing glucose control based on its policy. A higher average reward for DQN (39.09) indicates it is more effective in learning strategies that lead to favorable outcomes (ie, TIR maintenance and avoiding hypo- or hyperglycemia). 
LSTM is not a reinforcement model, so its reward is derived post hoc and may not reflect learning behavior but rather performance fitting (<xref ref-type="fig" rid="figure3">Figure 3</xref>).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Model performance comparison for deep-Q-network with LSTM. DQN: deep-Q-network; LSTM: long short-term memory; MAE: mean absolute error; RMSE: root mean square error; TIR: time-in range.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="diabetes_v11i1e79195_fig03.png"/></fig><p><xref ref-type="table" rid="table2">Table 2</xref> summarizes the comparative insights for the model performance using all evaluation metrics between DQN and LSTM models.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Summary of comparative insights.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dimension</td><td align="left" valign="bottom">Better model</td><td align="left" valign="bottom">Reason</td></tr></thead><tbody><tr><td align="left" valign="top">RMSE<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">DQN<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">Fewer large-scale prediction errors</td></tr><tr><td align="left" valign="top">MAE<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">LSTM<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">Closer average prediction to ground truth</td></tr><tr><td align="left" valign="top">TIR<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup> (%)</td><td align="left" valign="top">DQN</td><td align="left" valign="top">Maintains glucose in safe range longer</td></tr><tr><td align="left" valign="top">Reward</td><td align="left" valign="top">DQN</td><td align="left" 
valign="top">Optimizes policy to reinforce good outcomes</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>RMSE: root mean square error.</p></fn><fn id="table2fn2"><p><sup>b</sup>DQN: deep-Q-network.</p></fn><fn id="table2fn3"><p><sup>c</sup>MAE: mean absolute error.</p></fn><fn id="table2fn4"><p><sup>d</sup>LSTM: long short-term memory.</p></fn><fn id="table2fn5"><p><sup>e</sup>TIR: time in range.</p></fn></table-wrap-foot></table-wrap><p><xref ref-type="table" rid="table3">Table 3</xref> summarizes the statistical analysis of model performance between DQN and LSTM models. On average, the RMSE across models is about 13.1 mg/dL. The relatively small SD shows little variability between DQN (12.39) and LSTM (13.87). The wide CI reflects the small sample size (only 2 models), so additional baselines would stabilize the estimate. The MAE varied more strongly between models (DQN=9.85 vs LSTM=3.69), leading to a large SD and an unrealistic CI range that includes negative values. This shows that comparing only 2 models provides limited inferential strength. LSTM had notably lower MAE, which suggests better average prediction closeness, though this must be contextualized with other metrics. Both models kept patients in the clinical target range (70&#x2010;180 mg/dL) approximately 63% of the time. The small SD indicates stable performance between DQN (64.06%) and LSTM (62.10%). Still, the CI is wide because of limited sample points; more repeated trials are needed to confirm robustness. The reward variance is high due to differences in RL design. DQN achieved a much higher average reward (39.09) than LSTM (24.54). 
The wide CI reflects the volatility of the RL process and the insufficient number of training episodes (only 10).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Summary statistics of model performance.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Metric</td><td align="left" valign="bottom">Mean (SD)</td><td align="left" valign="bottom">95% CI</td></tr></thead><tbody><tr><td align="left" valign="top">RMSE<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">13.13 (1.05)</td><td align="left" valign="top">(3.73 to 22.53)</td></tr><tr><td align="left" valign="top">MAE<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">6.77 (4.36)</td><td align="left" valign="top">(&#x2013;32.37 to 45.91)</td></tr><tr><td align="left" valign="top">TIR<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup> (%)</td><td align="left" valign="top">63.08 (1.39)</td><td align="left" valign="top">(50.63 to 75.53)</td></tr><tr><td align="left" valign="top">Reward</td><td align="left" valign="top">31.82 (10.29)</td><td align="left" valign="top">(&#x2013;60.62 to 124.25)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>RMSE: root mean square error.</p></fn><fn id="table3fn2"><p><sup>b</sup>MAE: mean absolute error.</p></fn><fn id="table3fn3"><p><sup>c</sup>TIR: time in range.</p></fn></table-wrap-foot></table-wrap><p>Overall, RMSE and TIR are stable across models, suggesting both frameworks maintain reasonable glucose control. The MAE favors LSTM, but this may be influenced by LSTM&#x2019;s smoother short-term predictions, whereas DQN optimizes long-term control. The reward strongly favors DQN, aligning with its reinforcement design, but instability remains due to the small number of training episodes. 
Statistical confidence is weak due to a very small sample size (n=2 models); adding baselines (eg, autoregressive integrated moving average, random forest, and simple linear predictors) would allow meaningful variance testing (ANOVA).</p></sec><sec id="s3-4"><title>Explainability Analysis of RL Model</title><p>The RL model provides a clinically aligned strategy for insulin management. It not only predicts future glucose values but acts upon them in a self-improving feedback loop, a defining characteristic of RL. We implemented an explainability approach in the model using techniques such as SHAP and LIME, so that clinicians can understand these decisions, aiding interpretability and safety validation in future real-world trials.</p></sec><sec id="s3-5"><title>SHAP Summary Plot</title><p>The SHAP [<xref ref-type="bibr" rid="ref25">25</xref>] summary plot provides a global view of the feature importance and direction of impact on the model&#x2019;s glucose level predictions. We have applied the SHAP method to improve the interpretability of our RL model. The feature values are represented in color from low (blue) to high (red), and their impact (SHAP value) is plotted on the x-axis as shown in <xref ref-type="fig" rid="figure4">Figure 4</xref>.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>SHAP plot. SHAP: Shapley additive explanations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="diabetes_v11i1e79195_fig04.png"/></fig><p>As shown in <xref ref-type="fig" rid="figure4">Figure 4</xref>, &#x201C;meal_carbs&#x201D; shows the highest SHAP impact range, varying from approximately &#x2212;25 to +10. High meal_carbs (red) significantly increase predicted glucose. &#x201C;exercise_duration&#x201D; ranges in SHAP values from about &#x2212;10 to +10, indicating that longer durations can both decrease or modestly increase glucose depending on context. 
<italic>&#x201C;</italic>bolus_dose&#x201D; mostly impacts glucose predictions negatively, ranging from &#x2212;10 to +5 SHAP units, where higher doses (red) tend to lower glucose predictions. <italic>&#x201C;</italic>exercise_intensity&#x201D; exhibits mostly negative SHAP values clustered between &#x2212;7 and 0, reflecting a minor downward pressure on glucose levels with increased intensity.</p></sec><sec id="s3-6"><title>SHAP Dependence and Interaction Effect of Meal Carbs Feature With Exercise Duration</title><p><xref ref-type="fig" rid="figure5">Figure 5</xref> shows the interaction plot of features meal carbs with exercise duration and their impact on the SHAP values. This dependence plot explores the impact of &#x201C;meal_carbs&#x201D; on the SHAP values, which reflects its influence on glucose predictions. The plot also colors each point by &#x201C;exercise_duration&#x201D; to show interaction effects, as shown in <xref ref-type="fig" rid="figure5">Figure 5</xref>.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>SHAP interaction plot. SHAP: Shapley additive explanations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="diabetes_v11i1e79195_fig05.png"/></fig><p>When <italic>&#x201C;</italic>meal_carbs<italic>&#x201D;</italic> is normalized below 0.2, SHAP values mostly fall between &#x2212;5 and +5, showing minimal impact. As <italic>&#x201C;</italic>meal_carbs&#x201D; increases to the range of 0.4&#x2010;0.6, SHAP values often rise above +5, with some reaching ~+10. At very high &#x201C;meal_carbs&#x201D; values (near 1.0), SHAP values can reach as low as &#x2212;25 and as high as +10. 
Points with lower <italic>&#x201C;</italic>exercise_duration&#x201D; (more pink or red) cluster with higher positive SHAP values, indicating increased glucose risk when carbs are high and exercise is low.</p></sec><sec id="s3-7"><title>LIME Explanation Plot for Individual Predictions</title><p>The LIME plot presents the local feature contribution to a single glucose level prediction made by the model. <xref ref-type="fig" rid="figure6">Figure 6</xref> shows the LIME plot to explain the individual features&#x2019; contribution in the glucose prediction. The positive and negative bars show how much each feature pushed the predicted value up or down for this instance.</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Local interpretable model-agnostic explanations plot.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="diabetes_v11i1e79195_fig06.png"/></fig><p>As shown in <xref ref-type="fig" rid="figure6">Figure 6</xref>, we observed predicted glucose value: ~148.71 mg/dL (within the model&#x2019;s learned bounds of 83.38 to 160.11 mg/dL). The <italic>&#x201C;</italic>meal_carbs&#x201D; feature at 0.62 contributed +8.27 units to glucose prediction while <italic>&#x201C;bolus_dose</italic>&#x201D; at 0.50 contributed +2.90 units. This might indicate a mismatch or delay in insulin effect. The <italic>&#x201C;</italic>exercise_intensity&#x201D; feature at 0.50 contributed +2.27 units while &#x201C;exercise_duration<italic>&#x201D;</italic> feature at 0.38 contributed &#x2212;0.77 units, helping to modestly reduce predicted glucose.</p></sec><sec id="s3-8"><title>Clinical Interpretation and Implications</title><p>Our model explainability results uncover how key variables influence BG predictions by combining global (SHAP) and local (LIME) interpretability methods. 
The key takeaways are as follows. High meal carbohydrate intake is a dominant factor raising glucose, especially when not balanced by insulin or physical activity. Insulin dosing (&#x201C;bolus_dose&#x201D;) generally reduces glucose but can be insufficient if the meal size is high or if timing is off. Exercise, particularly duration, helps buffer glucose spikes and improves prediction outcomes. Personalized decision-making should consider the interaction between carbs, insulin, and exercise&#x2014;simple rule-based systems may overlook these subtleties. This reinforces the need for dynamic, explainable AI systems in diabetes management to tailor recommendations based on a full patient context.</p><p>We ground the interpretability in patient-facing and clinician-facing scenarios, using concrete values we already computed. For example, for a single decision explained with LIME, the predicted glucose was 148.7 mg/dL; the features contributing upward were &#x201C;meal_carbs=0.62&#x201D; (+8.27 SHAP or LIME units), &#x201C;bolus_dose=0.50&#x201D; (+2.90 units; likely timing mismatch vs IOB), and &#x201C;exercise_intensity=0.50&#x201D; (+2.27 units), whereas &#x201C;exercise_duration=0.38&#x201D; (&#x2212;0.77 units) contributed downward, lowering risk. 
The clinical actionability examples are:</p><list list-type="order"><list-item><p>Premeal guidance: if SHAP shows carbs consistently as the top driver (+6&#x2010;10 units) and IOB is low, suggest pre-bolus timing (eg, 15&#x2010;20 min) or carb ratio adjustment for that period of day.</p></list-item><list-item><p>Postexercise hypoglycemia risk: if SHAP highlights high exercise_duration with negative contributions and a declining rate of change, the system can warn the user to reduce the correction bolus or add carbs to avoid late-onset lows.</p></list-item><list-item><p>Overnight stability: if SHAP shows basal-driven negative contributions at night with frequent lows, suggest basal down-titration or tighter overnight safety constraints in the policy.</p></list-item></list></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In summary, our results suggest that DQN outperformed static dosing models and matched supervised models such as LSTM in RMSE while offering dynamic control. Explainability analysis revealed critical decision factors. An RMSE of 12.39 mg/dL is also fairly low but slightly higher than the MAE, suggesting that there might be occasional predictions with larger errors. Despite this, the model&#x2019;s overall predictive accuracy is good. The percentage of time in the target range was 64.06%. This metric indicates that glucose levels were within the clinically safe range (70&#x2010;180 mg/dL) about 64.06% of the time. Although this represents the majority of the time, there remains room for improvement, as optimal diabetes management typically aims for a higher percentage within this range to minimize the risk of complications from high or low BG levels. 
An average reward of 39.09 indicates that the agent generally performs well under the reward structure we have defined, which presumably rewards the agent for maintaining glucose levels within the target range and penalizes it for deviations.</p></sec><sec id="s4-2"><title>Clinical Relevance</title><p>Explainable RL can improve clinician trust, facilitate regulatory approval, and ensure patient safety. Adaptive insulin recommendations reduce the risk of adverse glycemic events. General observations show that the majority of values lie below 200 mg/dL, which is generally considered within a manageable range for people with diabetes but includes many readings below 70 mg/dL, which are hypoglycemic. The upward trend in the data visualization suggests an increase in average glucose levels over time, though this could also reflect variabilities in patient behavior or treatment efficacy. The output from the simulation showed a glucose level of approximately 80.06 mg/dL and a reward of 10, suggesting that the adjustments to the insulin sensitivity and decay parameters have significantly improved the realism and functionality of the environment. DQN is superior in handling real-time decisions and dynamic conditions, making it more suitable for adaptive insulin dosing in personalized care. LSTM could be valuable in applications where predicting general trends is sufficient (eg, retrospective analysis and forecast dashboards). Integrating hybrid architectures (eg, DQN for policy and LSTM for predictive enhancement) may yield optimal results. Ultimately, DQN&#x2019;s better TIR and reward profile signal its strength in closed-loop, autonomous glucose management systems.</p></sec><sec id="s4-3"><title>Limitations and Future Work</title><p>Although the results are promising, our work has certain limitations. First, the data were limited to 12 participants which limits the applicability of our work to real-time deployment. 
However, it provides sufficient evidence to support the use of RL-based models for glucose prediction and insulin dosing recommendations. A larger dataset will help to validate the models better. Future work includes hybrid DQN-LSTM ensembles and generalization to type 2 diabetes. We aim to apply continuous enhancement of the models to optimize performance by focusing on key components such as:</p><list list-type="order"><list-item><p>Data quality and quantity: ensuring high-quality and comprehensive data can help improve model accuracy. This includes a detailed recording of insulin doses, meals, exercise, glucose levels, etc.</p></list-item><list-item><p>Feature engineering: explore different features or combinations that might improve model predictions, such as time of day, preceding meal types, or exercise intensity.</p></list-item><list-item><p>Model tuning: for LSTM and DQN, parameter tuning could optimize performance. This involves adjusting learning rate, number of trees, depth of trees, etc.</p></list-item><list-item><p>Ensemble techniques: combining predictions from different models (eg, an ensemble of LSTM and DQN) might leverage strengths and mitigate individual weaknesses.</p></list-item><list-item><p>Incremental training: for LSTM and DQN, consider using an incremental training approach to continually update the models as new data become available, which might help in adapting to changes in patients&#x2019; lifestyle or insulin sensitivity.</p></list-item></list></sec><sec id="s4-4"><title>Conclusions</title><p>We demonstrate that a DQN-based RL system can effectively personalize insulin dosing in T1D. The system achieved strong predictive performance, maintained glucose within safe ranges, and enhanced interpretability through SHAP and LIME. This work paves the way for clinically integrated, AI-assisted diabetes care systems. 
The analysis shows that while the insulin type does not lead to significant differences in glucose control, individual management strategies, including timing and dosage adjustments, are critical. The variability seen both within and across patients suggests the need for personalized diabetes management plans, closely monitored by health care providers. The frequent fluctuations and the presence of high and low extremes indicate a need for a reinforcement learning model that can dynamically adjust insulin dosages based on CGM data to better manage and stabilize glucose levels. In reinforcement learning models such as DQN, the reward function plays a key role. If the reward function is not designed correctly, the model might not learn the correct associations between actions and outcomes (predictions). A poor reward signal could result in the model always predicting values close to zero.</p></sec></sec></body><back><notes><sec><title>Funding</title><p>The research reported in this article was supported by AIM-AHEAD Coordinating Center, award number OTA-21-017, and was, in part, funded by the National Institutes of Health (NIH) Agreement No. 1OT2OD032581-02-836. The views and conclusions contained in this document are those of the authors and should not be interpreted as representing the official policies, either expressed or implied, of the NIH.</p></sec><sec><title>Data Availability</title><p>Prior to March 17, 2026, access to the dataset required an executed Data Use Agreement (DUA). The dataset is now available upon request through the website [<xref ref-type="bibr" rid="ref39">39</xref>]. The dataset curator has no objection to the publication of this article, and the previous DUA requirement has been waived.</p></sec></notes><fn-group><fn fn-type="con"><p>EMT contributed to the investigation, methodology, data analysis, visualization, and writing the original draft of the manuscript. 
VG contributed to data curation, resources, supervision, as well as reviewing and editing the manuscript. AS contributed to co-supervision, conceptual framing, data interpretation, clinical context, manuscript restructuring, and critical revisions. All authors reviewed and approved the final version.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CGM</term><def><p>continuous glucose monitoring</p></def></def-item><def-item><term id="abb3">DQN</term><def><p>deep Q-network</p></def></def-item><def-item><term id="abb4">IOB</term><def><p>insulin on-board</p></def></def-item><def-item><term id="abb5">LIME</term><def><p>local interpretable model-agnostic explanations</p></def></def-item><def-item><term id="abb6">LR</term><def><p>logistic regression</p></def></def-item><def-item><term id="abb7">LSTM</term><def><p>long short-term memory</p></def></def-item><def-item><term id="abb8">MAE</term><def><p>mean absolute error</p></def></def-item><def-item><term id="abb9">RL</term><def><p>reinforcement learning</p></def></def-item><def-item><term id="abb10">RMSE</term><def><p>root mean square error</p></def></def-item><def-item><term id="abb11">SHAP</term><def><p>Shapley additive explanations</p></def></def-item><def-item><term id="abb12">T1D</term><def><p>type-1 diabetes</p></def></def-item><def-item><term id="abb13">TIR</term><def><p>time in range</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>Diabetes statistics</article-title><source>National Institute of Diabetes and Digestive and Kidney Diseases</source><access-date>2025-04-23</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.niddk.nih.gov/health-information/health-statistics/diabetes-statistics">https://www.niddk.nih.gov/health-information/health-statistics/diabetes-statistics</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><article-title>Type 1 diabetes</article-title><source>Mayo Clinic</source><access-date>2025-04-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.mayoclinic.org/diseases-conditions/type-1-diabetes/symptoms-causes/syc-20353011">https://www.mayoclinic.org/diseases-conditions/type-1-diabetes/symptoms-causes/syc-20353011</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="web"><article-title>Type 1 diabetes</article-title><source>Johns Hopkins Medicine</source><access-date>2025-04-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.hopkinsmedicine.org/health/conditions-and-diseases/diabetes/type-1-diabetes">https://www.hopkinsmedicine.org/health/conditions-and-diseases/diabetes/type-1-diabetes</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hirsch</surname><given-names>IB</given-names> </name></person-group><article-title>Type 1 diabetes mellitus and the use of flexible insulin regimens</article-title><source>Am Fam Physician</source><year>1999</year><month>11</month><day>15</day><volume>60</volume><issue>8</issue><fpage>2343</fpage><lpage>2352</lpage><pub-id pub-id-type="medline">10593324</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>ZM</given-names> </name></person-group><article-title>Ethics and governance of 
trustworthy medical artificial intelligence</article-title><source>BMC Med Inform Decis Mak</source><year>2023</year><month>01</month><day>13</day><volume>23</volume><issue>1</issue><fpage>7</fpage><pub-id pub-id-type="doi">10.1186/s12911-023-02103-9</pub-id><pub-id pub-id-type="medline">36639799</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Keskinbora</surname><given-names>KH</given-names> </name></person-group><article-title>Medical ethics considerations on artificial intelligence</article-title><source>J Clin Neurosci</source><year>2019</year><month>06</month><volume>64</volume><fpage>277</fpage><lpage>282</lpage><pub-id pub-id-type="doi">10.1016/j.jocn.2019.03.001</pub-id><pub-id pub-id-type="medline">30878282</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Attia</surname><given-names>ZI</given-names> </name><name name-style="western"><surname>Kapa</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lopez-Jimenez</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Screening for cardiac contractile dysfunction using an artificial intelligence-enabled electrocardiogram</article-title><source>Nat Med</source><year>2019</year><month>01</month><volume>25</volume><issue>1</issue><fpage>70</fpage><lpage>74</lpage><pub-id pub-id-type="doi">10.1038/s41591-018-0240-2</pub-id><pub-id pub-id-type="medline">30617318</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>&#x00C1;lvarez-Machancoses</surname><given-names>&#x00D3;</given-names> </name><name 
name-style="western"><surname>Fern&#x00E1;ndez-Mart&#x00ED;nez</surname><given-names>JL</given-names> </name></person-group><article-title>Using artificial intelligence methods to speed up drug discovery</article-title><source>Expert Opin Drug Discov</source><year>2019</year><month>08</month><volume>14</volume><issue>8</issue><fpage>769</fpage><lpage>777</lpage><pub-id pub-id-type="doi">10.1080/17460441.2019.1621284</pub-id><pub-id pub-id-type="medline">31140873</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Watson</surname><given-names>DS</given-names> </name><name name-style="western"><surname>Krutzinna</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bruce</surname><given-names>IN</given-names> </name><etal/></person-group><article-title>Clinical applications of machine learning algorithms: beyond the black box</article-title><source>BMJ</source><year>2019</year><month>03</month><day>12</day><volume>364</volume><fpage>l886</fpage><pub-id pub-id-type="doi">10.1136/bmj.l886</pub-id><pub-id pub-id-type="medline">30862612</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hamdi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ben Ali</surname><given-names>J</given-names> </name><name name-style="western"><surname>Di Costanzo</surname><given-names>V</given-names> </name><name name-style="western"><surname>Fnaiech</surname><given-names>F</given-names> </name><name name-style="western"><surname>Moreau</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ginoux</surname><given-names>JM</given-names> </name></person-group><article-title>Accurate prediction of continuous blood glucose based on support vector regression and 
differential evolution algorithm</article-title><source>Biocybern Biomed Eng</source><year>2018</year><volume>38</volume><issue>2</issue><fpage>362</fpage><lpage>372</lpage><pub-id pub-id-type="doi">10.1016/j.bbe.2018.02.005</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Birjais</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mourya</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Chauhan</surname><given-names>R</given-names> </name><name name-style="western"><surname>Kaur</surname><given-names>H</given-names> </name></person-group><article-title>Prediction and diagnosis of future diabetes risk: a machine learning approach</article-title><source>SN Appl Sci</source><year>2019</year><month>09</month><volume>1</volume><issue>9</issue><fpage>1</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.1007/s42452-019-1117-9</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sadhu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jadli</surname><given-names>A</given-names> </name></person-group><article-title>Early-stage diabetes risk prediction: a comparative analysis of classification algorithms</article-title><source>IARJSET</source><year>2021</year><volume>8</volume><issue>2</issue><fpage>193</fpage><lpage>201</lpage><pub-id pub-id-type="doi">10.17148/IARJSET.2021.8228</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Xue</surname><given-names>J</given-names> </name><name name-style="western"><surname>Min</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Ma</surname><given-names>F</given-names> </name></person-group><article-title>Research on diabetes prediction method based on machine learning</article-title><source>Journal of Physics: Conference Series</source><year>2020</year><volume>1684</volume><publisher-name>IOP Publishing</publisher-name><fpage>012062</fpage></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Le</surname><given-names>TM</given-names> </name><name name-style="western"><surname>Vo</surname><given-names>TM</given-names> </name><name name-style="western"><surname>Pham</surname><given-names>TN</given-names> </name><name name-style="western"><surname>Dao</surname><given-names>SVT</given-names> </name></person-group><article-title>A novel wrapper-based feature selection for early diabetes prediction enhanced with a metaheuristic</article-title><source>IEEE Access</source><year>2020</year><volume>9</volume><fpage>7869</fpage><lpage>7884</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2020.3047942</pub-id><pub-id pub-id-type="medline">32542175</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Shafi</surname><given-names>MKI</given-names> </name><name name-style="western"><surname>Sultan</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Rahman</surname><given-names>SMM</given-names> </name><name name-style="western"><surname>Hoque</surname><given-names>MM</given-names> </name></person-group><article-title>IoT based smart home: a machine learning approach</article-title><conf-name>2021 24th International Conference on Computer and Information Technology (ICCIT)</conf-name><conf-date>Dec 18-20, 2021</conf-date><conf-loc>Dhaka, Bangladesh</conf-loc><fpage>1</fpage><lpage>6</lpage><pub-id 
pub-id-type="doi">10.1109/ICCIT54785.2021.9689786</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xie</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Q</given-names> </name></person-group><article-title>Benchmarking machine learning algorithms on blood glucose prediction for type I diabetes in comparison with classical time-series models</article-title><source>IEEE Trans Biomed Eng</source><year>2020</year><month>11</month><volume>67</volume><issue>11</issue><fpage>3101</fpage><lpage>3124</lpage><pub-id pub-id-type="doi">10.1109/TBME.2020.2975959</pub-id><pub-id pub-id-type="medline">32091990</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kriukova</surname><given-names>G</given-names> </name><name name-style="western"><surname>Shvai</surname><given-names>N</given-names> </name><name name-style="western"><surname>Pereverzyev</surname><given-names>SV</given-names> </name></person-group><article-title>Application of regularized ranking and collaborative filtering in predictive alarm algorithm for nocturnal hypoglycemia prevention</article-title><conf-name>2017 9th IEEE International Conference on Intelligent Data Acquisition and Advanced Computing Systems</conf-name><conf-date>Sep 21-23, 2017</conf-date><conf-loc>Bucharest, Romania</conf-loc><fpage>634</fpage><lpage>638</lpage><pub-id pub-id-type="doi">10.1109/IDAACS.2017.8095169</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mujahid</surname><given-names>O</given-names> </name><name name-style="western"><surname>Contreras</surname><given-names>I</given-names> </name><name 
name-style="western"><surname>Vehi</surname><given-names>J</given-names> </name></person-group><article-title>Machine learning techniques for hypoglycemia prediction: trends and challenges</article-title><source>Sensors (Basel)</source><year>2021</year><month>01</month><day>14</day><volume>21</volume><issue>2</issue><fpage>546</fpage><pub-id pub-id-type="doi">10.3390/s21020546</pub-id><pub-id pub-id-type="medline">33466659</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kefayati</surname><given-names>S</given-names> </name><name name-style="western"><surname>Id&#x00E9;</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Predicting nocturnal hypoglycemia from continuous glucose monitoring data with extended prediction horizon</article-title><source>AMIA Annu Symp Proc</source><year>2020</year><volume>2019</volume><fpage>874</fpage><lpage>882</lpage><pub-id pub-id-type="medline">32308884</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fathi</surname><given-names>AE</given-names> </name><name name-style="western"><surname>Breton</surname><given-names>MD</given-names> </name></person-group><article-title>Using reinforcement learning to simplify mealtime insulin dosing for people with type 1 diabetes: in-silico experiments</article-title><source>IFAC-PapersOnLine</source><year>2023</year><volume>56</volume><issue>2</issue><fpage>11539</fpage><lpage>11544</lpage><pub-id pub-id-type="doi">10.1016/j.ifacol.2023.10.446</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Tejedor</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hjerde</surname><given-names>SN</given-names> </name><name name-style="western"><surname>Myhre</surname><given-names>JN</given-names> </name><name name-style="western"><surname>Godtliebsen</surname><given-names>F</given-names> </name></person-group><article-title>Evaluating deep Q-learning algorithms for controlling blood glucose in in silico type 1 diabetes</article-title><source>Diagnostics (Basel)</source><year>2023</year><month>10</month><day>7</day><volume>13</volume><issue>19</issue><fpage>3150</fpage><pub-id pub-id-type="doi">10.3390/diagnostics13193150</pub-id><pub-id pub-id-type="medline">37835893</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Magni</surname><given-names>L</given-names> </name><name name-style="western"><surname>Raimondo</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Bossi</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Model predictive control of type 1 diabetes: an in silico trial</article-title><source>J Diabetes Sci Technol</source><year>2007</year><month>11</month><volume>1</volume><issue>6</issue><fpage>804</fpage><lpage>812</lpage><pub-id pub-id-type="doi">10.1177/193229680700100603</pub-id><pub-id pub-id-type="medline">19885152</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Deep reinforcement learning for automated insulin delivery systems: 
algorithms, applications, and prospects</article-title><source>AI</source><year>2025</year><volume>6</volume><issue>5</issue><fpage>87</fpage><pub-id pub-id-type="doi">10.3390/ai6050087</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gottesman</surname><given-names>O</given-names> </name><name name-style="western"><surname>Johansson</surname><given-names>F</given-names> </name><name name-style="western"><surname>Komorowski</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Guidelines for reinforcement learning in healthcare</article-title><source>Nat Med</source><year>2019</year><month>01</month><volume>25</volume><issue>1</issue><fpage>16</fpage><lpage>18</lpage><pub-id pub-id-type="doi">10.1038/s41591-018-0310-5</pub-id><pub-id pub-id-type="medline">30617332</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lundberg</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>SI</given-names> </name></person-group><article-title>A unified approach to interpreting model predictions</article-title><conf-name>31st Conference on Neural Information Processing Systems (NIPS 2017)</conf-name><conf-date>Dec 4-9, 2017</conf-date><pub-id pub-id-type="doi">10.48550/arXiv.1705.07874</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ribeiro</surname><given-names>MT</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Guestrin</surname><given-names>C</given-names> </name></person-group><article-title>&#x201C;Why should I trust 
you?&#x201D;: explaining the predictions of any classifier</article-title><conf-name>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name><conf-date>Aug 13-17, 2016</conf-date><conf-loc>San Francisco, CA</conf-loc><fpage>1135</fpage><lpage>1144</lpage><pub-id pub-id-type="doi">10.1145/2939672.2939778</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Al-Hamadani</surname><given-names>MNA</given-names> </name><name name-style="western"><surname>Fadhel</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Alzubaidi</surname><given-names>L</given-names> </name><name name-style="western"><surname>Balazs</surname><given-names>H</given-names> </name></person-group><article-title>Reinforcement learning algorithms and applications in healthcare and robotics: a comprehensive and systematic review</article-title><source>Sensors (Basel)</source><year>2024</year><month>04</month><day>11</day><volume>24</volume><issue>8</issue><fpage>2461</fpage><pub-id pub-id-type="doi">10.3390/s24082461</pub-id><pub-id pub-id-type="medline">38676080</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Marling</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bunescu</surname><given-names>R</given-names> </name></person-group><article-title>The OhioT1DM dataset for blood glucose level prediction: update 2020</article-title><source>CEUR Workshop Proc</source><year>2020</year><month>09</month><volume>2675</volume><fpage>71</fpage><lpage>74</lpage><pub-id pub-id-type="medline">33584164</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Marling</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bunescu</surname><given-names>RC</given-names> </name></person-group><article-title>The OhioT1DM dataset for blood glucose level prediction: update 2020</article-title><source>CEUR Workshop Proc</source><year>2020</year><month>09</month><volume>2675</volume><fpage>71</fpage><lpage>74</lpage><pub-id pub-id-type="medline">33584164</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Sutton</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Barto</surname><given-names>AG</given-names> </name></person-group><source>Reinforcement Learning: An Introduction</source><year>2018</year><edition>2</edition><publisher-name>MIT Press</publisher-name><pub-id pub-id-type="other">978-0262039246</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kober</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bagnell</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Peters</surname><given-names>J</given-names> </name></person-group><article-title>Reinforcement learning in robotics: a survey</article-title><source>Int J Rob Res</source><year>2013</year><month>09</month><volume>32</volume><issue>11</issue><fpage>1238</fpage><lpage>1274</lpage><pub-id pub-id-type="doi">10.1177/0278364913495721</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Berny</surname><given-names>A</given-names> </name></person-group><article-title>Selection and reinforcement learning for combinatorial 
optimization</article-title><conf-name>Parallel Problem Solving from Nature PPSN VI: 6th International Conference</conf-name><conf-date>Sep 18-20, 2000</conf-date><conf-loc>Paris, France</conf-loc><fpage>601</fpage><lpage>610</lpage><pub-id pub-id-type="doi">10.1007/3-540-45356-3_59</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van Eck</surname><given-names>NJ</given-names> </name><name name-style="western"><surname>van Wezel</surname><given-names>M</given-names> </name></person-group><article-title>Application of reinforcement learning to the game of Othello</article-title><source>Comput Oper Res</source><year>2008</year><month>06</month><volume>35</volume><issue>6</issue><fpage>1999</fpage><lpage>2017</lpage><pub-id pub-id-type="doi">10.1016/j.cor.2006.10.004</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Esteva</surname><given-names>A</given-names> </name><name name-style="western"><surname>Robicquet</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ramsundar</surname><given-names>B</given-names> </name><etal/></person-group><article-title>A guide to deep learning in healthcare</article-title><source>Nat Med</source><year>2019</year><month>01</month><volume>25</volume><issue>1</issue><fpage>24</fpage><lpage>29</lpage><pub-id pub-id-type="doi">10.1038/s41591-018-0316-z</pub-id><pub-id pub-id-type="medline">30617335</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oroojeni Mohammad Javad</surname><given-names>M</given-names> </name><name name-style="western"><surname>Agboola</surname><given-names>SO</given-names> </name><name 
name-style="western"><surname>Jethwani</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zeid</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kamarthi</surname><given-names>S</given-names> </name></person-group><article-title>A reinforcement learning-based method for management of type 1 diabetes: exploratory study</article-title><source>JMIR Diabetes</source><year>2019</year><month>08</month><day>28</day><volume>4</volume><issue>3</issue><fpage>e12905</fpage><pub-id pub-id-type="doi">10.2196/12905</pub-id><pub-id pub-id-type="medline">31464196</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="book"><person-group person-group-type="editor"><name name-style="western"><surname>Sammut</surname><given-names>C</given-names> </name><name name-style="western"><surname>Webb</surname><given-names>GI</given-names> </name></person-group><source>Encyclopedia of Machine Learning</source><year>2011</year><publisher-name>Springer</publisher-name><pub-id pub-id-type="doi">10.1007/978-0-387-30164-8</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="book"><person-group person-group-type="editor"><name name-style="western"><surname>Salkind</surname><given-names>NJ</given-names> </name></person-group><source>Encyclopedia of Research Design</source><year>2010</year><publisher-name>SAGE Publications, Inc</publisher-name><pub-id pub-id-type="other">1412961270</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aleppo</surname><given-names>G</given-names> </name></person-group><article-title>Clinical application of time in range and other metrics</article-title><source>Diabetes Spectr</source><year>2021</year><month>05</month><volume>34</volume><issue>2</issue><fpage>109</fpage><lpage>118</lpage><pub-id 
pub-id-type="doi">10.2337/ds20-0093</pub-id><pub-id pub-id-type="medline">34149251</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="web"><article-title>OhioT1DM dataset</article-title><source>University of North Carolina at Charlotte</source><access-date>2026-05-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://webpages.charlotte.edu/rbunescu/data/ohiot1dm/OhioT1DM-dataset.html">https://webpages.charlotte.edu/rbunescu/data/ohiot1dm/OhioT1DM-dataset.html</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Descriptive statistics of glucose levels and cohort characteristics.</p><media xlink:href="diabetes_v11i1e79195_app1.docx" xlink:title="DOCX File, 20 KB"/></supplementary-material></app-group></back></article>