Applications
Tools and Frameworks
A complete guide to tools for building with AI
This guide covers the main tools and frameworks for building artificial-intelligence applications, from classical ML to LLMs.
Classical Machine Learning
scikit-learn
The reference library for ML in Python.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
# Full pipeline: scaling followed by a random forest
pipeline = Pipeline([
('scaler', StandardScaler()),
('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Cross-validation
scores = cross_val_score(pipeline, X_train, y_train, cv=5)
print(f"CV Score: {scores.mean():.3f} (+/- {scores.std():.3f})")
# Train and evaluate on the held-out test set
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
Strengths:
- Consistent, intuitive API
- Excellent documentation
- Wide choice of algorithms
- Easy integration with pandas/numpy (see the sketch below)
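The pandas integration deserves a concrete look. Below is a minimal sketch (the DataFrame and its column names are made up for illustration) showing a ColumnTransformer consuming a pandas DataFrame directly inside a Pipeline:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Hypothetical DataFrame mixing numeric and categorical columns
df = pd.DataFrame({
    "age": [25, 32, 47, 51],
    "income": [30000, 52000, 61000, 75000],
    "city": ["Paris", "Lyon", "Paris", "Nice"],
    "churn": [0, 1, 0, 1],
})
X, y = df.drop(columns="churn"), df["churn"]
# Scale numeric columns, one-hot encode the categorical one
preprocess = ColumnTransformer([
    ("num", StandardScaler(), ["age", "income"]),
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["city"]),
])
clf = Pipeline([
    ("preprocess", preprocess),
    ("classifier", RandomForestClassifier(random_state=42)),
])
clf.fit(X, y)  # the DataFrame flows through the pipeline unchanged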
XGBoost / LightGBM
High-performance gradient-boosting algorithms.
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
# XGBoost hyperparameter grid
params = {
'max_depth': [3, 5, 7],
'learning_rate': [0.01, 0.1],
'n_estimators': [100, 200],
'subsample': [0.8, 1.0]
}
# Note: the old use_label_encoder flag is deprecated and was removed in XGBoost 2.x
model = xgb.XGBClassifier(
    objective='binary:logistic'
)
# Grid Search
grid_search = GridSearchCV(model, params, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
print(f"Best params: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")import lightgbm as lgb
# LightGBM with early stopping
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
params = {
'objective': 'binary',
'metric': 'auc',
'boosting_type': 'gbdt',
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9
}
model = lgb.train(
params,
train_data,
num_boost_round=1000,
valid_sets=[valid_data],
callbacks=[lgb.early_stopping(stopping_rounds=50)]
)
Deep Learning
PyTorch
The most popular framework in research.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# Model definition
class NeuralNetwork(nn.Module):
def __init__(self, input_size, hidden_size, num_classes):
super().__init__()
self.layers = nn.Sequential(
nn.Linear(input_size, hidden_size),
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(hidden_size, hidden_size),
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(hidden_size, num_classes)
)
def forward(self, x):
return self.layers(x)
# Configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = NeuralNetwork(784, 256, 10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Training loop
def train_epoch(model, dataloader, criterion, optimizer):
model.train()
total_loss = 0
for batch_x, batch_y in dataloader:
batch_x, batch_y = batch_x.to(device), batch_y.to(device)
optimizer.zero_grad()
outputs = model(batch_x)
loss = criterion(outputs, batch_y)
loss.backward()
optimizer.step()
total_loss += loss.item()
return total_loss / len(dataloader)
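# Putting it together (a sketch): wrap random tensors in the DataLoader
# imported above and run a few epochs
train_loader = DataLoader(
    TensorDataset(torch.randn(1024, 784), torch.randint(0, 10, (1024,))),
    batch_size=64, shuffle=True
)
for epoch in range(10):
    loss = train_epoch(model, train_loader, criterion, optimizer)
    print(f"epoch {epoch}: loss={loss:.4f}")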
# Evaluation
def measure_accuracy(model, dataloader):
model.eval() # switch to inference mode
correct = 0
total = 0
with torch.no_grad():
for batch_x, batch_y in dataloader:
batch_x, batch_y = batch_x.to(device), batch_y.to(device)
outputs = model(batch_x)
_, predicted = torch.max(outputs, 1)
total += batch_y.size(0)
correct += (predicted == batch_y).sum().item()
return correct / total
TensorFlow / Keras
Production-ready, with a complete ecosystem.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Keras model (an explicit Input replaces the deprecated input_shape argument)
model = keras.Sequential([
    keras.Input(shape=(784,)),
    layers.Dense(256, activation='relu'),
layers.Dropout(0.3),
layers.Dense(128, activation='relu'),
layers.Dropout(0.3),
layers.Dense(10, activation='softmax')
])
model.compile(
optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy']
)
# Callbacks
callbacks = [
keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
keras.callbacks.ModelCheckpoint('best_model.keras', save_best_only=True),
keras.callbacks.TensorBoard(log_dir='./logs')
]
# Training
history = model.fit(
X_train, y_train,
epochs=50,
batch_size=32,
validation_split=0.2,
callbacks=callbacks
)
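# Inspecting the training curves from the returned History object (a sketch)
import matplotlib.pyplot as plt
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel('epoch')
plt.legend()
plt.show()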
# Save and reload
model.save('model.keras')
loaded_model = keras.models.load_model('model.keras')
PyTorch vs TensorFlow Comparison
| Aspect | PyTorch | TensorFlow |
|---|---|---|
| Syntax | Pythonic | More verbose |
| Debugging | Easy (eager mode) | Less intuitive |
| Research | Dominant | Less used |
| Production | TorchServe | TensorFlow Serving |
| Mobile | PyTorch Mobile | TensorFlow Lite |
| Ecosystem | Hugging Face | TFX (TensorFlow Extended) |
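To make the debugging row concrete: PyTorch executes eagerly, so you can drop an ordinary print (or a debugger breakpoint) between operations and inspect live tensors. A minimal sketch:
import torch
import torch.nn as nn
layer = nn.Linear(4, 2)
x = torch.randn(3, 4)
h = layer(x)
print(h.shape, h.mean().item())  # inspect the intermediate tensor directly
out = torch.relu(h)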
Hugging Face Transformers
The reference for pre-trained models.
Basic usage
from transformers import pipeline
# Text classification
classifier = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")
result = classifier("This product is really excellent!")
print(result) # [{'label': '5 stars', 'score': 0.72}]
# Text generation
generator = pipeline("text-generation", model="gpt2")
text = generator("Artificial intelligence", max_length=50)
# Question answering
qa = pipeline("question-answering")
result = qa(
    question="What is machine learning?",
    context="Machine learning is a branch of AI that lets machines learn from data."
)
# Translation
translator = pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr")
translation = translator("Hello, how are you?")
Fine-tuning with Trainer
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
TrainingArguments,
Trainer
)
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score
# Load model and tokenizer
model_name = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Prepare the data (allocine: French movie reviews, matching the French model)
dataset = load_dataset("allocine")
def tokenize_function(examples):
return tokenizer(examples["review"], padding="max_length", truncation=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Metrics
def compute_metrics(pred_output):
logits, labels = pred_output
predictions = np.argmax(logits, axis=-1)
return {"accuracy": accuracy_score(labels, predictions)}
# Configuration
training_args = TrainingArguments(
output_dir="./results",
num_train_epochs=3,
per_device_train_batch_size=16,
per_device_eval_batch_size=64,
warmup_steps=500,
weight_decay=0.01,
logging_dir="./logs",
save_strategy="epoch",
eval_strategy="epoch"  # named evaluation_strategy in transformers < 4.41
)
# Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["test"],
compute_metrics=compute_metrics
)
trainer.train()
LLM APIs
Anthropic Claude
from anthropic import Anthropic
client = Anthropic()
# Simple message
message = client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=1024,
system="Tu es un assistant expert en Python.",
messages=[
{"role": "user", "content": "Explique les décorateurs Python"}
]
)
print(message.content[0].text)
# Multi-turn conversation
messages = []
def chat(user_message):
messages.append({"role": "user", "content": user_message})
response = client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=1024,
messages=messages
)
assistant_message = response.content[0].text
messages.append({"role": "assistant", "content": assistant_message})
return assistant_message
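# Example use of the helper (a sketch): the history accumulates across calls
print(chat("What is a Python context manager?"))
print(chat("Show me a short example"))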
# Streaming
with client.messages.stream(
model="claude-sonnet-4-20250514",
max_tokens=1024,
messages=[{"role": "user", "content": "Écris un poème"}]
) as stream:
for text in stream.text_stream:
print(text, end="", flush=True)
OpenAI GPT
from openai import OpenAI
client = OpenAI()
# Chat completion
response = client.chat.completions.create(
model="gpt-4",
messages=[
{"role": "system", "content": "Tu es un assistant utile."},
{"role": "user", "content": "Qu'est-ce que le deep learning ?"}
],
temperature=0.7,
max_tokens=500
)
print(response.choices[0].message.content)
# With function calling
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Obtenir la météo d'une ville",
"parameters": {
"type": "object",
"properties": {
"city": {"type": "string", "description": "Nom de la ville"},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
},
"required": ["city"]
}
}
}
]
response = client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": "Quel temps fait-il à Paris ?"}],
tools=tools,
tool_choice="auto"
)
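# Handling the tool-call round trip (a sketch; get_weather is a hypothetical
# function you would implement yourself)
import json
tool_call = response.choices[0].message.tool_calls[0]
args = json.loads(tool_call.function.arguments)
weather = get_weather(**args)  # e.g. {"temp": 18, "unit": "celsius"}
# Send the result back so the model can write its final answer
followup = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "user", "content": "What's the weather in Paris?"},
        response.choices[0].message,
        {"role": "tool", "tool_call_id": tool_call.id, "content": json.dumps(weather)}
    ]
)
print(followup.choices[0].message.content)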
# Embeddings
response = client.embeddings.create(
model="text-embedding-3-small",
input="Texte à vectoriser"
)
embedding = response.data[0].embedding
Mistral AI
from mistralai import Mistral
client = Mistral(api_key="...")
# Chat
response = client.chat.complete(
model="mistral-large-latest",
messages=[
{"role": "user", "content": "Explique le machine learning"}
]
)
print(response.choices[0].message.content)
# Streaming
for chunk in client.chat.stream(
model="mistral-medium-latest",
messages=[{"role": "user", "content": "Raconte une histoire"}]
):
if chunk.data.choices[0].delta.content:
print(chunk.data.choices[0].delta.content, end="")
# Embeddings
response = client.embeddings.create(
model="mistral-embed",
inputs=["Premier texte", "Deuxième texte"]
)
Vector Databases
Chroma
Simple and lightweight, ideal for development.
import chromadb
from chromadb.utils import embedding_functions
# Persistent client
client = chromadb.PersistentClient(path="./chroma_db")
# OpenAI embedding function
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
api_key="...",
model_name="text-embedding-3-small"
)
# Create a collection
collection = client.get_or_create_collection(
name="documents",
embedding_function=openai_ef,
metadata={"hnsw:space": "cosine"}
)
# Add documents
collection.add(
documents=["First document", "Second document"],
metadatas=[{"source": "doc1"}, {"source": "doc2"}],
ids=["id1", "id2"]
)
# Query
results = collection.query(
query_texts=["My query"],
n_results=5,
where={"source": "doc1"} # optional metadata filter
)
Pinecone
Managed, scalable, production-ready.
from pinecone import Pinecone, ServerlessSpec
pc = Pinecone(api_key="...")
# Create an index
pc.create_index(
name="my-index",
dimension=1536,
metric="cosine",
spec={"serverless": {"cloud": "aws", "region": "us-east-1"}}
)
index = pc.Index("my-index")
# Upsert vectors
index.upsert(
vectors=[
{"id": "vec1", "values": [0.1, 0.2, ...], "metadata": {"text": "..."}},
{"id": "vec2", "values": [0.3, 0.4, ...], "metadata": {"text": "..."}}
],
namespace="my-namespace"
)
# Query
results = index.query(
vector=[0.1, 0.2, ...],
top_k=10,
include_metadata=True,
namespace="my-namespace"
)
pgvector (PostgreSQL)
Native integration with PostgreSQL.
import psycopg2
from pgvector.psycopg2 import register_vector
# Connect; make sure the vector extension exists before registering the type
conn = psycopg2.connect("postgresql://...")
cur = conn.cursor()
cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
conn.commit()
register_vector(conn)
# Create the table
cur.execute("""
CREATE TABLE IF NOT EXISTS documents (
id SERIAL PRIMARY KEY,
content TEXT,
embedding vector(1536)
)
""")
# Create the ANN index
cur.execute("""
CREATE INDEX ON documents
USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100)
""")
# Insert a row (commit to persist)
cur.execute(
"INSERT INTO documents (content, embedding) VALUES (%s, %s)",
("My text", embedding)
)
conn.commit()
# Similarity search (<=> is the cosine distance operator)
cur.execute("""
SELECT content, 1 - (embedding <=> %s) as similarity
FROM documents
ORDER BY embedding <=> %s
LIMIT 5
""", (query_embedding, query_embedding))Frameworks d'orchestration
LangChain
A complete framework for LLM applications.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
# Load a PDF
loader = PyPDFLoader("document.pdf")
documents = loader.load()
# Split into chunks
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200
)
splits = text_splitter.split_documents(documents)
# Build the vector store
vectorstore = Chroma.from_documents(
documents=splits,
embedding=OpenAIEmbeddings()
)
# Build the RAG chain
qa_chain = RetrievalQA.from_chain_type(
llm=ChatOpenAI(model="gpt-4"),
chain_type="stuff",
retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
return_source_documents=True
)
# Query
result = qa_chain.invoke({"query": "My question"})
print(result["result"])
LlamaIndex
Specialized in indexing and retrieval.
from llama_index.core import (
VectorStoreIndex,
SimpleDirectoryReader,
Settings
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
# Global configuration
Settings.llm = OpenAI(model="gpt-4")
Settings.embed_model = OpenAIEmbedding()
# Load the documents
documents = SimpleDirectoryReader("./data").load_data()
# Build the index
index = VectorStoreIndex.from_documents(documents)
# Simple query engine
query_engine = index.as_query_engine()
response = query_engine.query("My question")
print(response)
# Chat engine for multi-turn conversation
chat_engine = index.as_chat_engine(chat_mode="condense_question")
response = chat_engine.chat("Hi, tell me about this topic")
MLOps and Deployment
MLflow
Tracking and lifecycle management for ML.
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
# Start an experiment
mlflow.set_experiment("my-experiment")
with mlflow.start_run():
# Log parameters
mlflow.log_param("n_estimators", 100)
mlflow.log_param("max_depth", 5)
# Train the model
model = RandomForestClassifier(n_estimators=100, max_depth=5)
model.fit(X_train, y_train)
# Log metrics
accuracy = model.score(X_test, y_test)
mlflow.log_metric("accuracy", accuracy)
# Log the model
mlflow.sklearn.log_model(model, "model")
# Log artifacts
mlflow.log_artifact("confusion_matrix.png")
# Load a model back
loaded_model = mlflow.sklearn.load_model("runs:/<run_id>/model")
Weights & Biases
Advanced tracking and visualization.
import wandb
# Initialize a run
wandb.init(project="my-project", config={
"learning_rate": 0.001,
"epochs": 100,
"batch_size": 32
})
# Log during training (train_epoch and validate are placeholders here)
for epoch in range(100):
train_loss = train_epoch()
val_loss = validate()
wandb.log({
"train_loss": train_loss,
"val_loss": val_loss,
"epoch": epoch
})
# Log images, tables, etc.
wandb.log({"predictions": wandb.Table(dataframe=df)})
wandb.log({"image": wandb.Image(img)})
wandb.finish()
Docker for Deployment
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
EXPOSE 8000
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]FastAPI pour les APIs ML
from fastapi import FastAPI
from pydantic import BaseModel
import joblib
app = FastAPI()
# Load the model once at startup
model = joblib.load("model.joblib")
class PredictionInput(BaseModel):
features: list[float]
class PredictionOutput(BaseModel):
prediction: int
probability: float
@app.post("/predict", response_model=PredictionOutput)
def predict(input_data: PredictionInput):
prediction = model.predict([input_data.features])[0]
probability = model.predict_proba([input_data.features]).max()
return PredictionOutput(
prediction=int(prediction),
probability=float(probability)
)
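# Quick local test (a sketch, assuming the service runs on port 8000):
#   uvicorn main:app --port 8000
#   curl -X POST http://localhost:8000/predict \
#        -H "Content-Type: application/json" \
#        -d '{"features": [0.1, 0.2, 0.3]}'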
@app.get("/health")
def health():
return {"status": "healthy"}Résumé
| Category | Tools | Use case |
|---|---|---|
| Classical ML | scikit-learn, XGBoost | Classification, regression |
| Deep Learning | PyTorch, TensorFlow | Neural networks |
| NLP | Hugging Face | Pre-trained models |
| LLM APIs | Anthropic, OpenAI, Mistral | Text generation |
| Vector DBs | Chroma, Pinecone, pgvector | Semantic search |
| Orchestration | LangChain, LlamaIndex | RAG applications |
| MLOps | MLflow, W&B | Tracking, deployment |
| Serving | FastAPI, Docker | Production |