Commit 1affc70

Merge pull request #1 from ACGpp/main
Update tutorial (更新教程)
2 parents 9238fe2 + 2e34b32

34 files changed: +4931 −372 lines

.github/workflows/deploy.yml

Lines changed: 63 additions & 0 deletions
New file (63 additions, 0 deletions):

```yaml
name: Deploy to GitHub Pages

on:
  push:
    branches: [main]
  workflow_dispatch:

permissions:
  contents: read
  pages: write
  id-token: write

concurrency:
  group: "pages"
  cancel-in-progress: false

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Setup Node
        uses: actions/setup-node@v4
        with:
          node-version: '18'
          cache: npm
          cache-dependency-path: './website/package-lock.json'

      - name: Setup Pages
        uses: actions/configure-pages@v4

      - name: Install dependencies
        run: |
          cd website
          npm ci
          npm list vitepress

      - name: Build
        run: |
          cd website
          npm run docs:build
          ls -la docs/.vitepress/dist

      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          path: website/docs/.vitepress/dist

  deploy:
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    needs: build
    runs-on: ubuntu-latest
    name: Deploy
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4
```

README.md

Lines changed: 3 additions & 1 deletion
```diff
@@ -145,4 +145,6 @@ simple-ml-code/

 ## Acknowledgements (致谢)

-Thanks to all the developers who have contributed to this tutorial!
+Thanks to the creators of the tutorial at https://github.com/datawhalechina/machine-learning-toy-code
+Thanks to Sm1les, author of the pumpkin-book (南瓜书): https://github.com/datawhalechina/pumpkin-book
```
One binary file changed (not shown).

datasets/MNIST/raw/load_data.py

Lines changed: 120 additions & 0 deletions
New file (120 additions, 0 deletions):

```python
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Date: 2023-01-30 09:31:34
LastEditor: JiangJi
LastEditTime: 2023-01-30 09:31:35
Description:
'''
'''
This script provides two loaders: load_local_mnist, which decodes the local
.gz files into arrays, and load_online_data, which downloads MNIST via keras.
'''
import gzip
import os
from struct import unpack

import numpy as np


def __read_image(path):
    with gzip.open(path, 'rb') as f:
        magic, num, rows, cols = unpack('>4I', f.read(16))
        img = np.frombuffer(f.read(), dtype=np.uint8).reshape(num, 28 * 28)
    return img


def __read_label(path):
    with gzip.open(path, 'rb') as f:
        magic, num = unpack('>2I', f.read(8))
        lab = np.frombuffer(f.read(), dtype=np.uint8)
    return lab


def __normalize_image(image):
    '''Normalize pixel values from 0-255 to 0.0-1.0.'''
    img = image.astype(np.float32) / 255.0
    return img


def __one_hot_label(label):
    '''One-hot encode the labels.

    Args:
        label: digit labels in 0-9
    Returns:
        binary encoding, e.g. [0,0,1,0,0,0,0,0,0,0] represents the digit 2
    '''
    lab = np.zeros((label.size, 10))
    for i, row in enumerate(lab):
        row[label[i]] = 1
    return lab


def load_local_mnist(x_train_path=os.path.dirname(__file__) + '/train-images-idx3-ubyte.gz',
                     y_train_path=os.path.dirname(__file__) + '/train-labels-idx1-ubyte.gz',
                     x_test_path=os.path.dirname(__file__) + '/t10k-images-idx3-ubyte.gz',
                     y_test_path=os.path.dirname(__file__) + '/t10k-labels-idx1-ubyte.gz',
                     normalize=True, one_hot=True):
    '''Read the MNIST dataset from .gz files.

    Args:
        x_train_path / y_train_path / x_test_path / y_test_path: file paths
        normalize (bool, optional): scale pixels to 0-1. Defaults to True.
        one_hot (bool, optional): if True, labels are returned as one-hot
            arrays such as [0,0,1,0,0,0,0,0,0,0]
    Returns:
        (train images, train labels), (test images, test labels)
        The training set has 60000 rows, each a vector of 784 = 28*28 values.
    '''
    image = {
        'train': __read_image(x_train_path),
        'test': __read_image(x_test_path)
    }
    label = {
        'train': __read_label(y_train_path),
        'test': __read_label(y_test_path)
    }
    if normalize:
        for key in ('train', 'test'):
            image[key] = __normalize_image(image[key])
    if one_hot:
        for key in ('train', 'test'):
            label[key] = __one_hot_label(label[key])
    return (image['train'], label['train']), (image['test'], label['test'])


def load_online_data():  # categorical_crossentropy
    from keras.datasets import mnist
    from keras.utils import np_utils
    import numpy as np
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    number = 10000
    x_train, y_train = x_train[0:number], y_train[0:number]
    x_train = x_train.reshape(number, 28 * 28)
    x_test = x_test.reshape(x_test.shape[0], 28 * 28)
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    # convert class vectors to binary class matrices
    y_train = np_utils.to_categorical(y_train, 10)
    y_test = np_utils.to_categorical(y_test, 10)
    x_test = np.random.normal(x_test)  # add noise
    x_train, x_test = x_train / 255, x_test / 255
    return (x_train, y_train), (x_test, y_test)


if __name__ == "__main__":
    (x_train, y_train), (x_test, y_test) = load_local_mnist()
```
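The binary IDX layout that `__read_image` parses can be illustrated on a synthetic in-memory buffer. This is a minimal sketch (the helpers `make_idx_images` and `read_idx_images` are illustrative, not part of the repo), skipping the gzip layer that the real loader adds:

```python
from struct import pack, unpack

import numpy as np

def make_idx_images(num=2, rows=28, cols=28):
    # Big-endian header (magic, count, rows, cols) followed by raw uint8
    # pixels -- the same layout load_data.py unpacks with '>4I'.
    header = pack('>4I', 2051, num, rows, cols)  # 2051 is the IDX magic for image files
    body = bytes(num * rows * cols)              # all-zero pixels
    return header + body

def read_idx_images(raw):
    # Mirror of __read_image, minus the gzip.open wrapper.
    magic, num, rows, cols = unpack('>4I', raw[:16])
    img = np.frombuffer(raw[16:], dtype=np.uint8).reshape(num, rows * cols)
    return magic, img

magic, img = read_idx_images(make_idx_images())
print(magic, img.shape)  # 2051 (2, 784)
```

The fixed 16-byte header is why the loader reads exactly `f.read(16)` before handing the rest of the stream to `np.frombuffer`.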
Eight binary files changed (7.48 MB, 1.57 MB, 9.77 KB, 4.44 KB, 44.9 MB, 9.45 MB, 58.6 KB, 28.2 KB); binary files not shown.

datasets/README.ipynb

Lines changed: 163 additions & 0 deletions
Large diffs are not rendered by default.

docs/chapter2/逻辑回归.md

Lines changed: 43 additions & 22 deletions
```diff
@@ -1,8 +1,8 @@
 # Logistic Regression (逻辑回归)
 ## Code block
 import numpy as np
-from sklearn.datasets import fetch_openml
 from sklearn.linear_model import LogisticRegression
+import os
 ## Line-by-line explanation
 import numpy as np
 This line imports the numpy library under the alias np. numpy is a powerful numerical-computing library; think of it as our calculator, helping us carry out all kinds of numerical operations.
@@ -20,32 +20,53 @@
 ...functions, methods, and so on. Once these things and their usage have been declared, we put them to work, just as an essay first establishes the time, place, and characters and then uses them to tell you what happened. So do not fear the language itself; try to tie it to your own experience, and a whole wide world will open up.

 ## Loading the dataset
-mnist=fetch_openml('mnist_784')
-X,y=mnist['data'],mnist['target']
-X_train=np.array(X[:60000],dtype=float)
-y_train=np.array(y[:60000],dtype=float)
-X_test=np.array(X[60000:],dtype=float)
-y_test=np.array(y[60000:],dtype=float)
+# Load the MNIST dataset from local files
+def load_mnist_data():
+    from datasets.MNIST.raw.load_data import load_local_mnist
+    base_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'datasets', 'MNIST', 'raw')
+    (X_train, y_train), (X_test, y_test) = load_local_mnist(
+        x_train_path=os.path.join(base_path, 'train-images-idx3-ubyte.gz'),
+        y_train_path=os.path.join(base_path, 'train-labels-idx1-ubyte.gz'),
+        x_test_path=os.path.join(base_path, 't10k-images-idx3-ubyte.gz'),
+        y_test_path=os.path.join(base_path, 't10k-labels-idx1-ubyte.gz'),
+        normalize=True,
+        one_hot=False
+    )
+    return X_train, y_train, X_test, y_test
+
+# Load the data
+X_train, y_train, X_test, y_test = load_mnist_data()
 ## Line-by-line explanation
-mnist=fetch_openml('mnist_784')
-This line loads the dataset named mnist_784 through the fetch_openml function. It contains images of handwritten digits (0-9); 784 means each image has 784 pixel values (28*28 pixels).
-mnist is the loaded dataset object, i.e. the dataset is assigned to mnist; from then on mnist stands for this dataset, just as when we say Xiao Ming is good at math, mentioning Xiao Ming brings to mind that he is good at math.
+def load_mnist_data():
+This line defines a function named load_mnist_data for loading the local MNIST dataset. The function acts like a dedicated helper: we tell it where the data lives, and it fetches the data for us.

-X,y=mnist['data'],mnist['target']
-This line splits the mnist dataset into features X and labels y. mnist['data'] holds the image data (784 pixels per image), and mnist['target'] holds the labels these images correspond to (the digits 0-9 they represent).
+from datasets.MNIST.raw.load_data import load_local_mnist
+base_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'datasets', 'MNIST', 'raw')
+These two lines first import our custom load_local_mnist function and then set the dataset path. base_path is like a map that tells the program where to find the data files, and os.path.join stitches the path segments together so the files are found correctly on any operating system.

-Let's use an everyday example to understand features and labels:
-Features are an object's characteristics, like a person's height, weight, and age. In image recognition, the features are the image's pixel values.
-Labels are the targets we want to predict, e.g. judging from a person's features whether they are a student or a teacher; here "student" and "teacher" are the labels.
+(X_train, y_train), (X_test, y_test) = load_local_mnist(
+    x_train_path=os.path.join(base_path, 'train-images-idx3-ubyte.gz'),
+    y_train_path=os.path.join(base_path, 'train-labels-idx1-ubyte.gz'),
+    x_test_path=os.path.join(base_path, 't10k-images-idx3-ubyte.gz'),
+    y_test_path=os.path.join(base_path, 't10k-labels-idx1-ubyte.gz'),
+    normalize=True,
+    one_hot=False
+)
+This code calls load_local_mnist to load the data. It takes four file-path arguments:
+- train-images-idx3-ubyte.gz: training image data
+- train-labels-idx1-ubyte.gz: training label data
+- t10k-images-idx3-ubyte.gz: test image data
+- t10k-labels-idx1-ubyte.gz: test label data

-X_train=np.array(X[:60000],dtype=float)
-y_train=np.array(y[:60000],dtype=float)
-Here we use the first 60000 samples of mnist_784 for training. The first line assigns X (the features) to X_train; np.array(..., dtype=float) converts the data to a NumPy array with float elements. We convert to a NumPy array for better training, and use float because machine-learning algorithms expect floating-point input. You can surely work out the second line yourselves: it assigns the first 60000 samples of y to y_train, likewise as a float NumPy array.
+normalize=True means the image data is normalized: pixel values are mapped from 0-255 to decimals between 0 and 1, which makes model training more stable.
+one_hot=False means we do not one-hot encode the labels but use the plain digit labels 0-9 directly.

-X_test=np.array(X[60000:],dtype=float)
-y_test=np.array(y[60000:],dtype=float)
-Having studied the previous passage, can you work out what this code means?
-The samples from the 60001st onward are used as test data and assigned to X_test and y_test. This is the test set we use to evaluate the model's performance, again as float NumPy arrays.
+X_train, y_train, X_test, y_test = load_mnist_data()
+This line calls the function we defined to obtain the data. The dataset is split into a training set (X_train, y_train) and a test set (X_test, y_test):
+- X_train: training images, 60000 in total
+- y_train: labels for the training images
+- X_test: test images, 10000 in total
+- y_test: labels for the test images

 print(X_train.shape)
 print(y_train.shape)
```